From 1a82b047c1efe3a97983fc4f297a310679ef8082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 09:29:07 +0200 Subject: [PATCH 01/23] Removed reduction and multireduction declarations for MIC They are not implemented anyway... --- .../Containers/Algorithms/Multireduction.h | 20 +------------ .../Algorithms/Multireduction_impl.h | 14 --------- src/TNL/Containers/Algorithms/Reduction.h | 30 ------------------- 3 files changed, 1 insertion(+), 63 deletions(-) diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 42b8bf28d..6f64d31d5 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -14,16 +14,13 @@ #include #include -#include namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > -class Multireduction -{ -}; +class Multireduction; template<> class Multireduction< Devices::Cuda > @@ -55,21 +52,6 @@ public: typename Operation::ResultType* hostResult ); }; -template<> -class Multireduction< Devices::MIC > -{ -public: - template< typename Operation, typename Index > - static void - reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); -}; - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h index ebd0ad256..3bc0166bd 100644 --- a/src/TNL/Containers/Algorithms/Multireduction_impl.h +++ b/src/TNL/Containers/Algorithms/Multireduction_impl.h @@ -308,20 +308,6 @@ reduce( Operation& operation, #endif } -template< typename Operation, typename Index > -void -Multireduction< Devices::MIC >:: -reduce( Operation& operation, - const int n, - const Index size, - 
const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - throw Exceptions::NotImplementedError("Multireduction is not implemented for MIC."); -} - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index e2c6d5295..1f0c10a74 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -16,7 +16,6 @@ #include #include -#include namespace TNL { namespace Containers { @@ -83,35 +82,6 @@ class Reduction< Devices::Cuda > const Result& zero ); }; -template<> -class Reduction< Devices::MIC > -{ - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); - - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); -}; - } // namespace Algorithms } // namespace Containers } // namespace TNL -- GitLab From e470040a4ffe706ffd31c24c14ca5c6850cc9fb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 14:58:27 +0200 Subject: [PATCH 02/23] Style changes in the code for reduction --- .../Algorithms/CudaReductionKernel.h | 467 +++++++++--------- src/TNL/Containers/Algorithms/Reduction.h | 96 ++-- src/TNL/Containers/Algorithms/Reduction.hpp | 150 ++---- 3 files changed, 327 insertions(+), 386 deletions(-) diff --git 
a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 89e13a272..76c8f81b3 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -12,20 +12,17 @@ #include // std::pair -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include +#include namespace TNL { namespace Containers { namespace Algorithms { -#ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given @@ -34,6 +31,7 @@ namespace Algorithms { static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 static constexpr int Reduction_registersPerThread = 32; // empirically determined optimal value +#ifdef HAVE_CUDA // __CUDA_ARCH__ is defined only in device code! #if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 8; @@ -42,11 +40,11 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, @@ -56,19 +54,16 @@ CudaReductionKernel( const Result zero, const Index size, Result* output ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. 
x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSize < size ) { reduction( sdata[ tid ], dataFetcher( gid ) ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); @@ -111,28 +106,23 @@ CudaReductionKernel( const Result zero, // This runs in one warp so it is synchronized implicitly. if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSize >= 64 ) { + volatile Result* vsdata = sdata; + if( blockSize >= 64 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { + if( blockSize >= 16 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { + if( blockSize >= 8 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { + if( blockSize >= 4 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { + if( blockSize >= 2 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } } // Store the result back in the global memory. 
@@ -141,11 +131,11 @@ CudaReductionKernel( const Result zero, } template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionWithArgumentKernel( const Result zero, @@ -157,18 +147,15 @@ CudaReductionWithArgumentKernel( const Result zero, Index* idxOutput, const Index* idxInput = nullptr ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] ); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. if( idxInput ) { if( gid < size ) { sdata[ tid ] = dataFetcher( gid ); @@ -245,29 +232,24 @@ CudaReductionWithArgumentKernel( const Result zero, // This runs in one warp so it is synchronized implicitly. 
if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - volatile IndexType* vsidx = sidx; - if( blockSize >= 64 ) { + volatile Result* vsdata = sdata; + volatile Index* vsidx = sidx; + if( blockSize >= 64 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] ); - } // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { + if( blockSize >= 16 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { + if( blockSize >= 8 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { + if( blockSize >= 4 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { + if( blockSize >= 2 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] ); - } } // Store the result back in the global memory. @@ -276,15 +258,13 @@ CudaReductionWithArgumentKernel( const Result zero, idxOutput[ blockIdx.x ] = sidx[ 0 ]; } } +#endif template< typename Index, typename Result > struct CudaReductionKernelLauncher { - using IndexType = Index; - using ResultType = Result; - // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. 
This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -315,13 +295,13 @@ struct CudaReductionKernelLauncher const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output ) + Result*& output ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType ); + const std::size_t buf_size = 2 * desGridSize * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result >(); this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); return this->reducedSize; @@ -334,15 +314,15 @@ struct CudaReductionKernelLauncher const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output, - IndexType*& idxOutput ) + Result*& output, + Index*& idxOutput ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) ); + const std::size_t buf_size = 2 * desGridSize * ( sizeof( Result ) + sizeof( Index ) ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); - idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] ); + output = cudaReductionBuffer.template getData< Result >(); + idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] ); this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr ); return this->reducedSize; @@ -357,13 +337,13 @@ struct 
CudaReductionKernelLauncher { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; + auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); std::swap( input, output ); } @@ -373,7 +353,7 @@ struct CudaReductionKernelLauncher std::swap( input, output ); // Copy result on CPU - ResultType result; + Result result; ArrayOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 ); return result; } @@ -387,15 +367,15 @@ struct CudaReductionKernelLauncher { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; - IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] ); - IndexType* idxOutput = &idxInput[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; + Index* idxInput = reinterpret_cast< Index* >( &output[ desGridSize ] ); + Index* idxOutput = &idxInput[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; + auto copyFetch = [input] 
__cuda_callable__ ( Index i ) { return input[ i ]; }; this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput ); std::swap( input, output ); std::swap( idxInput, idxOutput ); @@ -426,95 +406,99 @@ struct CudaReductionKernelLauncher const Result& zero, Result* output ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * sizeof( ResultType ) - : blockSize.x * sizeof( ResultType ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * sizeof( Result ) + : blockSize.x * sizeof( Result ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, 
DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. + switch( blockSize.x ) + { + case 512: + CudaReductionKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, 
VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." 
); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - CudaReductionKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); - } + CudaReductionKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); + } - // Return the size of the output array on the CUDA device - return gridSize.x; + // Return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } template< typename DataFetcher, @@ -529,105 +513,108 @@ struct CudaReductionKernelLauncher Index* idxOutput, const Index* idxInput ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ) - : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) ) + : blockSize.x * ( sizeof( Result ) + sizeof( Index ) ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionWithArgumentKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, 
Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. 
+ switch( blockSize.x ) + { + case 512: + CudaReductionWithArgumentKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, 
Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." 
); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); - } + CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); + } - // return the size of the output array on the CUDA device - return gridSize.x; + // return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } const int activeDevice; const int blocksdPerMultiprocessor; const int desGridSize; - const IndexType originalSize; - IndexType reducedSize; + const Index originalSize; + Index reducedSize; }; -#endif } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index 1f0c10a74..6e5f9c12a 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -22,64 +22,62 @@ namespace Containers { namespace Algorithms { template< typename Device > -class Reduction; +struct Reduction; template<> -class Reduction< Devices::Host > +struct Reduction< Devices::Host > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename 
ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> -class Reduction< Devices::Cuda > +struct Reduction< Devices::Cuda > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 9d54fd7b1..fd1d781c7 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp 
@@ -12,13 +12,11 @@ #pragma once +#include // std::unique_ptr //#define CUDA_REDUCTION_PROFILING -#include -#include #include -#include #include #include @@ -53,24 +51,21 @@ reduce( const Index size, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable - ResultType result = zero; + Result result = zero; #pragma omp parallel { // initialize array for thread-local results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { reduction( r[ 0 ], dataFetcher( offset + i ) ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); @@ -82,7 +77,7 @@ reduce( const Index size, // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( r[ 0 ], dataFetcher( i ) ); } @@ -103,11 +98,11 @@ reduce( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { reduction( r[ 0 ], dataFetcher( offset + i ) ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); @@ -117,7 +112,7 @@ reduce( const Index size, } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * 
block_size; i < size; i++ ) reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); @@ -128,8 +123,8 @@ reduce( const Index size, return r[ 0 ]; } else { - ResultType result = zero; - for( IndexType i = 0; i < size; i++ ) + Result result = zero; + for( Index i = 0; i < size; i++ ) reduction( result, dataFetcher( i ) ); return result; } @@ -151,9 +146,6 @@ reduceWithArgument( const Index size, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; @@ -164,13 +156,13 @@ reduceWithArgument( const Index size, #pragma omp parallel { // initialize array for thread-local results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! 
initialized ) { arg[ 0 ] = offset + i; @@ -194,7 +186,7 @@ reduceWithArgument( const Index size, // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); } @@ -217,13 +209,13 @@ reduceWithArgument( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! initialized ) { @@ -246,7 +238,7 @@ reduceWithArgument( const Index size, } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results @@ -257,7 +249,7 @@ reduceWithArgument( const Index size, } else { std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); - for( IndexType i = 1; i < size; i++ ) + for( Index i = 1; i < size; i++ ) reduction( result.first, i, result.second, dataFetcher( i ) ); return result; } @@ -279,18 +271,10 @@ reduce( const Index size, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. 
- */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. + constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -298,18 +282,17 @@ reduce( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + CudaReductionKernelLauncher< Index, Result > reductionLauncher( size ); - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( 0 ); - IndexType reducedSize = reductionLauncher.start( + // start the reduction on the GPU + Result* deviceAux1( 0 ); + Index reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, zero, deviceAux1 ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -318,10 +301,8 @@ reduce( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. 
- */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -331,11 +312,9 @@ reduce( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); + // finish the reduction on the host + auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; + const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -344,9 +323,7 @@ reduce( const Index size, return result; } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. - */ + // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); #ifdef CUDA_REDUCTION_PROFILING @@ -358,10 +335,7 @@ reduce( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; +} template< typename Index, typename Result, @@ -376,18 +350,10 @@ reduceWithArgument( const Index size, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. 
- */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. + constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -395,20 +361,19 @@ reduceWithArgument( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + CudaReductionKernelLauncher< Index, Result > reductionLauncher( size ); - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( nullptr ); - IndexType* deviceIndexes( nullptr ); - IndexType reducedSize = reductionLauncher.startWithArgument( + // start the reduction on the GPU + Result* deviceAux1( nullptr ); + Index* deviceIndexes( nullptr ); + Index reducedSize = reductionLauncher.startWithArgument( reduction, volatileReduction, dataFetcher, zero, deviceAux1, deviceIndexes ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -417,11 +382,9 @@ reduceWithArgument( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. 
- */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - std::unique_ptr< IndexType[] > indexArray{ new IndexType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; + std::unique_ptr< Index[] > indexArray{ new Index[ reducedSize ] }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); ArrayOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); @@ -432,12 +395,10 @@ reduceWithArgument( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - //auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - //const ResultType result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); - for( IndexType i = 1; i < reducedSize; i++ ) + // finish the reduction on the host +// auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; +// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); + for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); #ifdef CUDA_REDUCTION_PROFILING @@ -447,9 +408,7 @@ reduceWithArgument( const Index size, return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] ); } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
- */ + // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); #ifdef CUDA_REDUCTION_PROFILING @@ -461,9 +420,6 @@ reduceWithArgument( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif } } // namespace Algorithms -- GitLab From b74a24d22e012ce1a25ad89ed906482a23588540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 15:13:16 +0200 Subject: [PATCH 03/23] Rewritten multireduction with lambda functions --- .../Algorithms/CudaMultireductionKernel.h | 249 ++++++-------- .../Containers/Algorithms/Multireduction.h | 70 ++-- .../Containers/Algorithms/Multireduction.hpp | 231 +++++++++++++ .../Algorithms/Multireduction_impl.h | 313 ------------------ src/TNL/Solvers/Linear/GMRES.h | 7 + src/TNL/Solvers/Linear/GMRES_impl.h | 53 ++- src/UnitTests/Containers/MultireductionTest.h | 51 ++- 7 files changed, 471 insertions(+), 503 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/Multireduction.hpp delete mode 100644 src/TNL/Containers/Algorithms/Multireduction_impl.h diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index e8fc7f8bb..47919b351 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -12,14 +12,11 @@ #pragma once -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include namespace TNL { namespace Containers { @@ -32,157 +29,128 @@ namespace Algorithms { * architecture so that there are no local memory spills. 
*/ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2 -static constexpr int Multireduction_registersPerThread = 38; // empirically determined optimal value +static constexpr int Multireduction_registersPerThread = 32; // empirically determined optimal value // __CUDA_ARCH__ is defined only in device code! #if (__CUDA_ARCH__ >= 300 ) - static constexpr int Multireduction_minBlocksPerMultiprocessor = 6; + static constexpr int Multireduction_minBlocksPerMultiprocessor = 8; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSizeX, typename Operation, typename Index > +template< int blockSizeX, + typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) -CudaMultireductionKernel( Operation operation, - const int n, +CudaMultireductionKernel( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + const int n, + Result* output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - - /*** - * Get thread id (tid) and global element id (gid). - * gridSizeX is the number of elements in the direction of x-axis - * processed by all blocks at the same time. - */ - const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x; - IndexType gid = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType gridSizeX = blockDim.x * gridDim.x; - - /*** - * Shift input1 and output pointers. 
- */ - const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; - if( y < n ) { - input1 += y * ldInput1; - output += y * gridDim.x; - } - else - return; - - /*** - * Start with the sequential reduction and push the - * result into the shared memory. - */ - sdata[ tid ] = operation.initialValue(); - while( gid + 4 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + + // Get the thread id (tid), global thread id (gid) and gridSize. + const Index tid = threadIdx.y * blockDim.x + threadIdx.x; + Index gid = blockIdx.x * blockDim.x + threadIdx.x; + const Index gridSizeX = blockDim.x * gridDim.x; + + // Get the dataset index. + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if( y >= n ) return; + + sdata[ tid ] = zero; + + // Start with the sequential reduction and push the result into the shared memory. 
+ while( gid + 4 * gridSizeX < size ) { + reduction( sdata[ tid ], dataFetcher( gid, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); gid += 4 * gridSizeX; } - while( gid + 2 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); + while( gid + 2 * gridSizeX < size ) { + reduction( sdata[ tid ], dataFetcher( gid, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); gid += 2 * gridSizeX; } - while( gid < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); + while( gid < size ) { + reduction( sdata[ tid ], dataFetcher( gid, y ) ); gid += gridSizeX; } __syncthreads(); - - //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); - - /*** - * Perform the parallel reduction. - */ + // Perform the parallel reduction. if( blockSizeX >= 1024 ) { - if( threadIdx.x < 512 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); - } + if( threadIdx.x < 512 ) + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSizeX >= 512 ) { - if( threadIdx.x < 256 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); - } + if( threadIdx.x < 256 ) + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSizeX >= 256 ) { - if( threadIdx.x < 128 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); - } + if( threadIdx.x < 128 ) + reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSizeX >= 128 ) { - if( threadIdx.x < 64 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); - } + if( threadIdx.x < 64 ) + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } - /*** - * This runs in one warp so it is synchronized implicitly. 
- * - * When the blockSizeX is less then or equal to the warp size, the shared memory - * must be at least 2 * blockSizeX elements per block, otherwise unallocated memory - * will be accessed !!! - */ + // This runs in one warp so it is synchronized implicitly. if( threadIdx.x < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSizeX >= 64 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } - if( blockSizeX >= 32 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSizeX >= 16 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSizeX >= 8 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSizeX >= 4 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSizeX >= 2 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } + volatile Result* vsdata = sdata; + if( blockSizeX >= 64 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + // Note that here we do not have to check if tid < 16 etc, because we have + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSizeX >= 32 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + if( blockSizeX >= 16 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + if( blockSizeX >= 8 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + if( blockSizeX >= 4 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + if( blockSizeX >= 2 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); } - /*** - * Store the result back in the global memory. - */ + // Store the result back in the global memory. 
if( threadIdx.x == 0 ) { - output[ blockIdx.x ] = sdata[ tid ]; + output[ blockIdx.x + y * gridDim.x ] = sdata[ tid ]; } } +#endif -template< typename Operation, typename Index > +template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > int -CudaMultireductionKernelLauncher( Operation& operation, - const int n, +CudaMultireductionKernelLauncher( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType*& output ) + const int n, + Result*& output ) { - typedef typename Operation::ResultType ResultType; - +#ifdef HAVE_CUDA // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -197,7 +165,7 @@ CudaMultireductionKernelLauncher( Operation& operation, / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread ); const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); dim3 blockSize, gridSize; - + // version A: max 16 rows of threads blockSize.y = TNL::min( n, 16 ); @@ -231,87 +199,86 @@ CudaMultireductionKernelLauncher( Operation& operation, // create reference to the reduction buffer singleton and set size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) - const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); + const std::size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template 
getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result >(); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * blockSize.y * sizeof( ResultType ) - : blockSize.x * blockSize.y * sizeof( ResultType ); - - //cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem < - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 256: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 128: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 64: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, 
Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 32: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 16: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 8: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 4: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); 
CudaMultireductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 2: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); default: throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; // return the size of the output array on the CUDA device return gridSize.x; -} +#else + throw Exceptions::CudaSupportMissing(); #endif +} } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 6f64d31d5..00ca6078a 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -20,40 +20,70 @@ namespace Containers { namespace Algorithms { template< typename Device > -class Multireduction; +struct Multireduction; template<> -class Multireduction< Devices::Cuda > +struct Multireduction< Devices::Host > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the 
reduction operation + * volatileReduction: callable object representing the reduction operation + * size: the size of each dataset + * n: number of datasets to be reduced + * result: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + const int n, + Result* result ); }; template<> -class Multireduction< Devices::Host > +struct Multireduction< Devices::Cuda > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the reduction operation + * volatileReduction: callable object representing the reduction operation + * size: the size of each dataset + * n: number of datasets to be reduced + * hostResult: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + const int n, + Result* hostResult ); }; } // namespace Algorithms } // namespace 
Containers } // namespace TNL -#include "Multireduction_impl.h" +#include "Multireduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp new file mode 100644 index 000000000..f5c193c42 --- /dev/null +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -0,0 +1,231 @@ +/*************************************************************************** + Multireduction_impl.h - description + ------------------- + begin : May 13, 2016 + copyright : (C) 2016 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include // std::unique_ptr + +//#define CUDA_REDUCTION_PROFILING + +#include +#include +#include +#include + +#ifdef CUDA_REDUCTION_PROFILING +#include +#include +#endif + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > +void +Multireduction< Devices::Host >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, + const Index size, + const int n, + Result* result ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." 
); + + constexpr int block_size = 128; + const int blocks = size / block_size; + +#ifdef HAVE_OPENMP + if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) +#pragma omp parallel + { + // first thread initializes the result array + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + } + + // initialize array for thread-local results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + #pragma omp for nowait + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // the first thread that reaches here processes the last, incomplete block + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + } + + // local reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + reduction( _r[ 0 ], _r[ 1 ] ); + reduction( _r[ 0 ], _r[ 2 ] ); + reduction( _r[ 0 ], _r[ 3 ] ); + } + + // inter-thread reduction of local results + #pragma omp critical + { + for( int k = 0; k < n; k++ ) + reduction( result[ k ], r[ 4 * k ] ); + } + } + else { +#endif + if( blocks > 1 ) { + // initialize array for unrolled results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + // main reduction (explicitly unrolled loop) + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 
* k; + for( int i = 0; i < block_size; i += 4 ) { + reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // reduction of the last, incomplete block (not unrolled) + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + + // reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + reduction( _r[ 0 ], _r[ 1 ] ); + reduction( _r[ 0 ], _r[ 2 ] ); + reduction( _r[ 0 ], _r[ 3 ] ); + + // copy the result into the output parameter + result[ k ] = _r[ 0 ]; + } + } + else { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + for( int i = 0; i < block_size; i++ ) + reduction( result[ k ], dataFetcher( offset + i, k ) ); + } + } + + for( int k = 0; k < n; k++ ) { + for( Index i = blocks * block_size; i < size; i++ ) + reduction( result[ k ], dataFetcher( i, k ) ); + } + } +#ifdef HAVE_OPENMP + } +#endif +} + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > +void +Multireduction< Devices::Cuda >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, + const Index size, + const int n, + Result* hostResult ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." 
); + + #ifdef CUDA_REDUCTION_PROFILING + Timer timer; + timer.reset(); + timer.start(); + #endif + + // start the reduction on the GPU + Result* deviceAux1 = nullptr; + const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, volatileReduction, size, n, deviceAux1 ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; + timer.reset(); + timer.start(); + #endif + + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] }; + ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; + timer.reset(); + timer.start(); + #endif + + // finish the reduction on the host + auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; + Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, volatileReduction, reducedSize, n, hostResult ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; + #endif +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h deleted file mode 100644 index 3bc0166bd..000000000 --- a/src/TNL/Containers/Algorithms/Multireduction_impl.h +++ /dev/null @@ -1,313 +0,0 @@ -/*************************************************************************** - Multireduction_impl.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. 
- email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "Multireduction.h" - -//#define CUDA_REDUCTION_PROFILING - -#include -#include -#include -#include -#include -#include - -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/**** - * Arrays smaller than the following constant are reduced on CPU. - */ -//static constexpr int Multireduction_minGpuDataSize = 16384;//65536; //16384;//1024;//256; -// TODO: benchmarks with different values -static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * deviceInput1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the deviceInput1 array - * deviceInput2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Cuda >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ) -{ -#ifdef HAVE_CUDA - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." 
); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - typedef typename Operation::LaterReductionOperation LaterReductionOperation; - - /*** - * First check if the input array(s) is/are large enough for the multireduction on GPU. - * Otherwise copy it/them to host and multireduce on CPU. - */ - if( n * ldInput1 < Multireduction_minGpuDataSize ) { - DataType1 hostArray1[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray1, deviceInput1, n * ldInput1 ); - if( deviceInput2 ) { - using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; - _DT2 hostArray2[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray2, (_DT2*) deviceInput2, size ); - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); - } - else { - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); - } - return; - } - - #ifdef CUDA_REDUCTION_PROFILING - Timer timer; - timer.reset(); - timer.start(); - #endif - - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1 = nullptr; - const IndexType reducedSize = CudaMultireductionKernelLauncher( operation, - n, - size, - deviceInput1, - ldInput1, - deviceInput2, - deviceAux1 ); - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; - timer.reset(); - timer.start(); - #endif - - /*** - * Transfer the reduced data from device to host. 
- */ - ResultType resultArray[ n * reducedSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray, deviceAux1, n * reducedSize ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; - timer.reset(); - timer.start(); - #endif - -// std::cout << "resultArray = ["; -// for( int i = 0; i < n * reducedSize; i++ ) { -// std::cout << resultArray[ i ]; -// if( i < n * reducedSize - 1 ) -// std::cout << ", "; -// } -// std::cout << "]" << std::endl; - - /*** - * Reduce the data on the host system. - */ - LaterReductionOperation laterReductionOperation; - Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (void*) nullptr, hostResult ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; - #endif - - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * input1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the input1 array - * input2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Host >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." 
); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - - constexpr int block_size = 128; - const int blocks = size / block_size; - -#ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) -#pragma omp parallel - { - // first thread initializes the result array - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - } - - // initialize array for thread-local results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - #pragma omp for nowait - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // the first thread that reaches here processes the last, incomplete block - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - } - - // local reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - } - - // inter-thread reduction of local results - #pragma omp critical - { - for( int k 
= 0; k < n; k++ ) - operation.commonReduction( result[ k ], r[ 4 * k ] ); - } - } - else { -#endif - if( blocks > 1 ) { - // initialize array for unrolled results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - // main reduction (explicitly unrolled loop) - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // reduction of the last, incomplete block (not unrolled) - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - - // reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - - // copy the result into the output parameter - result[ k ] = _r[ 0 ]; - } - } - else { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - for( int i = 0; i < block_size; i++ ) - operation.firstReduction( result[ k ], offset + i, _input1, input2 ); - } - } - - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; 
- for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( result[ k ], i, _input1, input2 ); - } - } -#ifdef HAVE_OPENMP - } -#endif -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index 1cc6901dc..dd72e2832 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -84,9 +84,16 @@ protected: void hauseholder_cwy( VectorViewType v, const int i ); +// nvcc allows __cuda_callable__ lambdas only in public methods +#ifdef __NVCC__ +public: +#endif void hauseholder_cwy_transposed( VectorViewType z, const int i, ConstVectorViewType w ); +#ifdef __NVCC__ +protected: +#endif template< typename Vector > void update( const int k, diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index f45741151..d1ef7bc97 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -427,14 +426,28 @@ hauseholder_generate( const int i, if( i > 0 ) { // aux = Y_{i-1}^T * y_i RealType aux[ i ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( y_i ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; + auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile 
RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i, + ( (RealType) 0, + fetch, + reduction, + volatileReduction, size, - Y.getData(), - ldSize, - Traits::getLocalView( y_i ).getData(), + i, aux ); // no-op if the problem is not distributed CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); @@ -525,14 +538,28 @@ hauseholder_cwy_transposed( VectorViewType z, { // aux = Y_i^T * w RealType aux[ i + 1 ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i + 1, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( w ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _w = Traits::getConstLocalView( w ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; + auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i + 1, + ( (RealType) 0, + fetch, + reduction, + volatileReduction, size, - Y.getData(), - ldSize, - Traits::getConstLocalView( w ).getData(), + i + 1, aux ); // no-op if the problem is not distributed Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index 0487e916b..29d2800f3 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -94,29 
+94,48 @@ using VectorTypes = ::testing::Types< TYPED_TEST_SUITE( MultireductionTest, VectorTypes ); -TYPED_TEST( MultireductionTest, scalarProduct ) +// idiot nvcc does not allow __cuda_callable__ lambdas inside private or protected regions +template< typename DeviceVector, typename HostVector > +void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVector& result ) { - using RealType = typename TestFixture::DeviceVector::RealType; - using DeviceType = typename TestFixture::DeviceVector::DeviceType; + using RealType = typename DeviceVector::RealType; + using DeviceType = typename DeviceVector::DeviceType; + using IndexType = typename DeviceVector::IndexType; + + const RealType* _V = V.getData(); + const RealType* _y = y.getData(); + const IndexType size = y.getSize(); + const int n = result.getSize(); + ASSERT_EQ( V.getSize(), size * n ); - ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + auto fetch = [=] __cuda_callable__ ( IndexType i, int k ) + { + TNL_ASSERT_LT( i, size, "BUG: fetcher got invalid index i" ); + TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); + return _V[ i + k * size ] * _y[ i ]; + }; + auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Multireduction< DeviceType >::reduce - ( scalarProduct, - this->n, - this->size, - this->V.getData(), - this->size, - this->y.getData(), - this->result.getData() ); - - for( int i = 0; i < this->n; i++ ) { + ( (RealType) 0, + fetch, + reduction, + volatileReduction, + size, + n, + result.getData() ); + + for( int i = 0; i < n; i++ ) { if( i % 2 == 0 ) - EXPECT_EQ( this->result[ i ], 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], 0.5 * size * ( size - 1 ) ); else - EXPECT_EQ( this->result[ i ], - 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], - 0.5 * size * ( size - 1 ) ); } } - 
+TYPED_TEST( MultireductionTest, scalarProduct ) +{ + test_multireduction( this->V, this->y, this->result ); +} #endif // HAVE_GTEST -- GitLab From cbc2fff9dc7be94518bb2a9872fa5f43b3134148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 18:57:47 +0200 Subject: [PATCH 04/23] Found a way to avoid using volatile in CUDA reduction: __syncwarp() The performance seems to be identical to the code using volatile. --- .../Algorithms/CudaMultireductionKernel.h | 58 +++---- .../Algorithms/CudaReductionKernel.h | 153 +++++++++--------- .../Containers/Algorithms/Multireduction.hpp | 2 +- src/TNL/Containers/Algorithms/Reduction.hpp | 6 +- 4 files changed, 104 insertions(+), 115 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index 47919b351..782e66eaa 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -42,14 +42,12 @@ template< int blockSizeX, typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* output ) @@ -108,25 +106,29 @@ CudaMultireductionKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( threadIdx.x < 32 ) { - volatile Result* vsdata = sdata; if( blockSizeX >= 64 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSizeX >= 32 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); if( blockSizeX >= 16 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); if( blockSizeX >= 8 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); if( blockSizeX >= 4 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); if( blockSizeX >= 2 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -139,13 +141,11 @@ CudaMultireductionKernel( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > int CudaMultireductionKernelLauncher( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result*& output ) @@ -215,55 +215,55 @@ CudaMultireductionKernelLauncher( const Result zero, { case 512: CudaMultireductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 256: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 128: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 64: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, 
cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 32: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 16: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 8: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 4: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, 
DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 2: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 76c8f81b3..03da1e556 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -43,14 +43,12 @@ template< int blockSize, typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output ) { @@ -104,25 +102,29 @@ CudaReductionKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile Result* vsdata = sdata; if( blockSize >= 64 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); if( blockSize >= 16 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); if( blockSize >= 8 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); if( blockSize >= 4 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); if( blockSize >= 2 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. @@ -134,14 +136,12 @@ template< int blockSize, typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionWithArgumentKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output, Index* idxOutput, @@ -230,26 +230,29 @@ CudaReductionWithArgumentKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile Result* vsdata = sdata; - volatile Index* vsidx = sidx; if( blockSize >= 64 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); if( blockSize >= 16 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); if( blockSize >= 8 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); if( blockSize >= 4 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); if( blockSize >= 2 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -289,10 +292,8 @@ struct CudaReductionKernelLauncher } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int start( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result*& output ) @@ -303,15 +304,13 @@ struct CudaReductionKernelLauncher cudaReductionBuffer.setSize( buf_size ); output = cudaReductionBuffer.template getData< Result >(); - this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); + this->reducedSize = this->launch( originalSize, reduction, dataFetcher, zero, output ); return this->reducedSize; } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int startWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result*& output, @@ -324,15 +323,13 @@ struct CudaReductionKernelLauncher output = cudaReductionBuffer.template getData< Result >(); idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] ); - this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr ); + this->reducedSize = this->launchWithArgument( originalSize, reduction, dataFetcher, zero, output, idxOutput, nullptr ); return this->reducedSize; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > Result finish( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half @@ -344,7 +341,7 @@ struct CudaReductionKernelLauncher { // this lambda has to be defined inside the loop, because the captured variable changes auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; - this->reducedSize = this->launch( 
this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); + this->reducedSize = this->launch( this->reducedSize, reduction, copyFetch, zero, output ); std::swap( input, output ); } @@ -358,11 +355,9 @@ struct CudaReductionKernelLauncher return result; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > std::pair< Index, Result > finishWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half @@ -376,7 +371,7 @@ struct CudaReductionKernelLauncher { // this lambda has to be defined inside the loop, because the captured variable changes auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; - this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput ); + this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, copyFetch, zero, output, idxOutput, idxInput ); std::swap( input, output ); std::swap( idxInput, idxOutput ); } @@ -397,11 +392,9 @@ struct CudaReductionKernelLauncher protected: template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launch( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result* output ) @@ -425,55 +418,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, 
Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); 
CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." 
<< std::endl ); @@ -485,10 +478,10 @@ struct CudaReductionKernelLauncher // Check just to future-proof the code setting blockSize.x if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); } else { TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); @@ -502,11 +495,9 @@ struct CudaReductionKernelLauncher } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launchWithArgument( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result* output, @@ -532,55 +523,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionWithArgumentKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 256: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, 
volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 128: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 64: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 32: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 16: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, 
cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 8: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 4: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 2: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput 
); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); @@ -592,10 +583,10 @@ struct CudaReductionKernelLauncher // Check just to future-proof the code setting blockSize.x if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); } else { TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index f5c193c42..e96e8e649 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -196,7 +196,7 @@ reduce( const Result zero, // start the reduction on the GPU Result* deviceAux1 = nullptr; - const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, volatileReduction, size, n, deviceAux1 ); + const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index fd1d781c7..93bb14e3e 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -288,7 +288,6 @@ reduce( const Index size, Result* deviceAux1( 0 ); Index reducedSize = reductionLauncher.start( reduction, - volatileReduction, dataFetcher, zero, deviceAux1 ); @@ -324,7 +323,7 @@ reduce( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU - auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); + auto result = reductionLauncher.finish( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -368,7 +367,6 @@ reduceWithArgument( const Index size, Index* deviceIndexes( nullptr ); Index reducedSize = reductionLauncher.startWithArgument( reduction, - volatileReduction, dataFetcher, zero, deviceAux1, @@ -409,7 +407,7 @@ reduceWithArgument( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU - auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); + auto result = reductionLauncher.finishWithArgument( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); -- 
GitLab From 13b89a716de4fe702aca8a37ac219cf60e556e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 19:20:02 +0200 Subject: [PATCH 05/23] Removed volatile reduction completely --- .../BLAS/CommonVectorOperations.hpp | 51 ++++++------------ .../Algorithms/ArrayOperationsCuda.hpp | 9 ++-- .../Containers/Algorithms/Multireduction.h | 6 --- .../Containers/Algorithms/Multireduction.hpp | 6 +-- src/TNL/Containers/Algorithms/Reduction.h | 8 --- src/TNL/Containers/Algorithms/Reduction.hpp | 12 +---- src/TNL/Containers/Expressions/Comparison.h | 45 ++++++---------- .../DistributedExpressionTemplates.h | 24 +++------ .../DistributedVerticalOperations.h | 20 +------ .../Expressions/ExpressionTemplates.h | 24 +++------ .../Expressions/StaticExpressionTemplates.h | 12 ----- .../Expressions/VerticalOperations.h | 53 +++++-------------- src/TNL/Solvers/Linear/GMRES_impl.h | 4 -- src/TNL/Solvers/ODE/Euler.hpp | 3 +- src/TNL/Solvers/ODE/Merson_impl.h | 4 +- src/UnitTests/Containers/MultireductionTest.h | 2 - .../Containers/VectorEvaluateAndReduceTest.h | 9 ++-- 17 files changed, 70 insertions(+), 222 deletions(-) diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index 727a0abcf..b665de0ea 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -31,8 +31,7 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return 
Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -49,8 +48,7 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -67,8 +65,7 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -85,8 +82,7 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile 
ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -103,8 +99,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -121,8 +116,7 @@ getVectorL2Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -146,8 +140,7 @@ getVectorLpNorm( const Vector& v, const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - 
auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -167,8 +160,7 @@ getVectorSum( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -188,8 +180,7 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -209,8 +200,7 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) 
{ return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -230,8 +220,7 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -251,8 +240,7 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return 
Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -272,8 +260,7 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -296,8 +283,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, return diff * diff; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -324,8 +310,7 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, 
fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -345,8 +330,7 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -366,8 +350,7 @@ getScalarProduct( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 8623c99de..120ef3801 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -135,8 +135,7 @@ compare( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; auto 
reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } template< typename Element, @@ -153,8 +152,7 @@ containsValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, @@ -171,8 +169,7 @@ containsOnlyValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 00ca6078a..12ed22003 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -32,7 +32,6 @@ struct Multireduction< Devices::Host > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation - * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number 
of datasets to be reduced * result: output array of size = n @@ -40,13 +39,11 @@ struct Multireduction< Devices::Host > template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* result ); @@ -62,7 +59,6 @@ struct Multireduction< Devices::Cuda > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation - * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n @@ -70,13 +66,11 @@ struct Multireduction< Devices::Cuda > template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ); diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index e96e8e649..2f1999303 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -33,14 +33,12 @@ namespace Algorithms { template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > void Multireduction< Devices::Host >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* result ) @@ -173,14 +171,12 @@ reduce( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > void Multireduction< 
Devices::Cuda >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ) @@ -218,7 +214,7 @@ reduce( const Result zero, // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; - Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, volatileReduction, reducedSize, n, hostResult ); + Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index 6e5f9c12a..d4406332b 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -30,24 +30,20 @@ struct Reduction< Devices::Host > template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -58,24 +54,20 @@ struct Reduction< Devices::Cuda > template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename 
ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 93bb14e3e..28ddfaf26 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -41,13 +41,11 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25 template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -136,13 +134,11 @@ reduce( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Host >:: reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -261,13 +257,11 @@ reduceWithArgument( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -313,7 +307,7 @@ reduce( const Index size, // finish the reduction on the host auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; - const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); + 
const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -339,13 +333,11 @@ reduce( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -395,7 +387,7 @@ reduceWithArgument( const Index size, // finish the reduction on the host // auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; -// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); +// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index ff533e781..e65ec4958 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -66,8 +66,7 @@ struct VectorComparison< T1, T2, false > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -98,8 +97,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] 
__cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -113,8 +111,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -128,8 +125,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -143,8 +139,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= 
b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -161,8 +156,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -177,8 +171,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -188,8 +181,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true 
); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -199,8 +191,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -210,8 +201,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } }; @@ -228,8 +218,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -244,8 +233,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, 
ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -255,8 +243,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -266,8 +253,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -277,8 +263,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= 
b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 405b0014a..70eb37cc8 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ -2187,12 +2187,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2201,19 +2199,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2222,7 +2218,7 @@ Result evaluateAndReduce( Vector& lhs, 
RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2232,12 +2228,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2250,19 +2244,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2275,7 +2267,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2285,12 +2277,10 @@ template< typename Vector, typename T2, template< typename, typename > 
class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2303,19 +2293,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2328,7 +2316,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h index 4e569a45d..43940417b 100644 --- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h +++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h @@ -70,15 +70,7 @@ auto DistributedExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, 
volatile RealType& a, volatile RealType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::max() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::max() ); result.first = gatheredResults[ result.first ].first; } return result; @@ -135,15 +127,7 @@ auto DistributedExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile RealType& a, volatile RealType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::lowest() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::lowest() ); result.first = gatheredResults[ result.first ].first; } return result; diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h index 5a069b0b9..8816f153b 100644 --- a/src/TNL/Containers/Expressions/ExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h @@ -2110,12 +2110,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename 
Vector::RealType; @@ -2124,19 +2122,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2145,7 +2141,7 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2155,12 +2151,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2173,19 +2167,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return 
Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2198,7 +2190,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2208,12 +2200,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2226,19 +2216,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename 
Vector::RealType; @@ -2251,7 +2239,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h index cce425923..0a3e8a7fb 100644 --- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h @@ -2276,13 +2276,11 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2295,13 +2293,11 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2317,13 +2313,11 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2339,13 
+2333,11 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2364,13 +2356,11 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2386,13 +2376,11 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index a12505780..2470a7f22 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -33,8 +33,7 @@ auto ExpressionMin( const Expression& expression ) -> std::decay_t< decltype( ex auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a < b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a < b ? 
a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -53,15 +52,7 @@ auto ExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -72,8 +63,7 @@ auto ExpressionMax( const Expression& expression ) -> std::decay_t< decltype( ex auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a > b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a > b ? 
a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -92,15 +82,7 @@ auto ExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -111,8 +93,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -123,8 +104,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( auto fetch = [=] 
__cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -135,8 +115,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ] * expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression, typename Real > @@ -147,8 +126,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), 
reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -159,8 +137,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a *= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a *= b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 1 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 1 ); } template< typename Expression > @@ -171,8 +148,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a && b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a && b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -183,8 +159,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a || b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a || b; }; - return Algorithms::Reduction< typename Expression::DeviceType 
>::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -195,8 +170,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a & b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a & b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -207,8 +181,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a | b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a | b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } } // namespace Expressions diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index d1ef7bc97..725ba3597 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -440,12 +440,10 @@ hauseholder_generate( const int i, const IndexType 
ldSize = this->ldSize; auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, reduction, - volatileReduction, size, i, aux ); @@ -552,12 +550,10 @@ hauseholder_cwy_transposed( VectorViewType z, const IndexType ldSize = this->ldSize; auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, reduction, - volatileReduction, size, i + 1, aux ); diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index ccc98daa7..3506152ff 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -120,8 +120,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) } } auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; - this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** * When time is close to stopTime the new residue diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index ce21298b0..55dbbdf4e 100644 --- 
a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -174,15 +174,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) time += currentTau; auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), - reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); ///// // When time is close to stopTime the new residue // may be inaccurate significantly. if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); - if( ! this->nextIteration() ) return false; diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index 29d2800f3..a876af2c3 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -115,12 +115,10 @@ void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVect return _V[ i + k * size ] * _y[ i ]; }; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, reduction, - volatileReduction, size, n, result.getData() ); diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index d8c26b93c..147f5946f 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -38,8 +38,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) using RealType = typename VectorView::RealType; auto reduction 
= [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return evaluateAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return evaluateAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } TYPED_TEST( VectorTest, evaluateAndReduce ) @@ -75,8 +74,7 @@ performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w ) using RealType = typename VectorView::RealType; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } template< typename VectorView > @@ -86,8 +84,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) using RealType = typename VectorView::RealType; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, 5.0 * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( w, 5.0 * v, reduction, ( RealType ) 0.0 ); } -- GitLab From 1777e488d4e00559b3b1a68a66b2ffd204f66f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 20:50:34 +0200 Subject: [PATCH 06/23] Removed ReductionOperations.h --- .../Algorithms/ArrayOperationsCuda.hpp | 1 - .../Algorithms/ArrayOperationsHost.hpp | 1 - .../Algorithms/ArrayOperationsMIC.hpp | 1 - .../Algorithms/CudaPrefixSumKernel.h | 1 - src/TNL/Containers/Algorithms/PrefixSum.hpp | 1 - .../Algorithms/ReductionOperations.h | 641 ------------------ src/TNL/Containers/DistributedVector.hpp | 1 - src/TNL/Containers/DistributedVectorView.hpp | 1 - 8 files changed, 648 deletions(-) 
delete mode 100644 src/TNL/Containers/Algorithms/ReductionOperations.h diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 120ef3801..3322a910e 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp index f7c164d06..3a54a862f 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp @@ -17,7 +17,6 @@ #include #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp index 5bf04dbaf..4113bbcd9 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp @@ -18,7 +18,6 @@ #include #include #include -#include #include namespace TNL { diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 40a662b08..0bfbe80ba 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -15,7 +15,6 @@ #include #include #include -#include #include namespace TNL { diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index d3c3b3071..1ffd4cf4e 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/src/TNL/Containers/Algorithms/ReductionOperations.h b/src/TNL/Containers/Algorithms/ReductionOperations.h deleted file mode 100644 index 33ef84b1c..000000000 --- 
a/src/TNL/Containers/Algorithms/ReductionOperations.h +++ /dev/null @@ -1,641 +0,0 @@ -/*************************************************************************** - ReductionOperations.h - description - ------------------- - begin : Mar 22, 2013 - copyright : (C) 2013 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include // std::numeric_limits - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/* - * Unary operations: reduction on one input vector. - */ - -template< typename Data, typename Result = Data > -class ParallelReductionSum -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const ResultType& data ) - { - result += data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const ResultType& data ) - { - result += data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMin -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] ); - } - - __cuda_callable__ 
void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::min( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::min( result, data ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMax -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] ); - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::max( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::max( result, data ); - } -}; - -template< typename Data, typename Result = bool > -class ParallelReductionLogicalAnd -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result && data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = result && data; - } -}; - - -template< typename Data, typename Result = bool > -class 
ParallelReductionLogicalOr -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result || data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = result || data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsSum : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsMin : public ParallelReductionMin< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class 
ParallelReductionAbsMax : public ParallelReductionMax< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionL2Norm : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const Data& aux = data1[ index ]; - result += aux * aux; - } -}; - - -template< typename Data, typename Result = Data, typename PType = Data > -class ParallelReductionLpNorm : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] ), p ); - } - -protected: - PType p; -}; - - -/* - * Binary operations: reduction on two input vectors. 
- */ - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionEqualities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionInequalities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] != data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionScalarProduct : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] * data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffSum : public ParallelReductionSum< Result, 
Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] - data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsSum : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = 
Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffL2Norm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static 
constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const ResultType aux = data2[ index ] - data1[ index ]; - result += aux * aux; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1, typename PType = Data1 > -class ParallelReductionDiffLpNorm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] - data2[ index ] ), p ); - } - -protected: - PType p; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsValue : public ParallelReductionLogicalOr< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsOnlyValue : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - template< typename 
Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index 73781c906..f50624c61 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -13,7 +13,6 @@ #pragma once #include "DistributedVector.h" -#include #include namespace TNL { diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 38715161c..2f1e4666e 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -13,7 +13,6 @@ #pragma once #include "DistributedVectorView.h" -#include #include namespace TNL { -- GitLab From d0fc1bb7c6bd3d4fc9a6eeb34685b9aedec30414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 09:07:35 +0200 Subject: [PATCH 07/23] Removed VectorOperations class which is now useless It contained only methods for prefixSum and segmentedPrefixSum, which were identical for Host and Cuda, so they can be easily implemented directly in Vector and VectorView. 
--- src/Benchmarks/BLAS/CommonVectorOperations.h | 2 - .../BLAS/CommonVectorOperations.hpp | 1 - src/TNL/Containers/Algorithms/PrefixSum.h | 11 ++-- src/TNL/Containers/Algorithms/PrefixSumType.h | 24 ------- .../Containers/Algorithms/VectorAssignment.h | 1 - .../Containers/Algorithms/VectorOperations.h | 64 ------------------- .../Algorithms/VectorOperationsCuda_impl.h | 58 ----------------- .../Algorithms/VectorOperationsHost_impl.h | 55 ---------------- src/TNL/Containers/Vector.hpp | 10 ++- src/TNL/Containers/VectorView.h | 2 +- src/TNL/Containers/VectorView.hpp | 12 ++-- .../Containers/VectorBinaryOperationsTest.h | 1 - .../Containers/VectorEvaluateAndReduceTest.h | 1 - .../Containers/VectorPrefixSumTest.h | 30 ++++----- src/UnitTests/Containers/VectorTestSetup.h | 4 -- .../Containers/VectorUnaryOperationsTest.h | 1 - .../Containers/VectorVerticalOperationsTest.h | 1 - 17 files changed, 38 insertions(+), 240 deletions(-) delete mode 100644 src/TNL/Containers/Algorithms/PrefixSumType.h delete mode 100644 src/TNL/Containers/Algorithms/VectorOperations.h delete mode 100644 src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h delete mode 100644 src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.h b/src/Benchmarks/BLAS/CommonVectorOperations.h index 504da8fa6..ed2915368 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.h +++ b/src/Benchmarks/BLAS/CommonVectorOperations.h @@ -10,8 +10,6 @@ #pragma once -#include - namespace TNL { namespace Benchmarks { diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index b665de0ea..033e99301 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -11,7 +11,6 @@ #pragma once #include -#include #include "CommonVectorOperations.h" namespace TNL { diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h 
index 715d6f1b9..53d6d7f83 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -14,20 +14,23 @@ #include #include -#include -#include namespace TNL { namespace Containers { namespace Algorithms { +enum class PrefixSumType { + Exclusive, + Inclusive +}; + template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class PrefixSum {}; +class PrefixSum; template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class SegmentedPrefixSum {}; +class SegmentedPrefixSum; template< PrefixSumType Type > diff --git a/src/TNL/Containers/Algorithms/PrefixSumType.h b/src/TNL/Containers/Algorithms/PrefixSumType.h deleted file mode 100644 index ba03adfe6..000000000 --- a/src/TNL/Containers/Algorithms/PrefixSumType.h +++ /dev/null @@ -1,24 +0,0 @@ -/*************************************************************************** - PrefixSumType.h - description - ------------------- - begin : Jun 6, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -namespace TNL { -namespace Containers { -namespace Algorithms { - -enum class PrefixSumType { - Exclusive, - Inclusive -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorAssignment.h b/src/TNL/Containers/Algorithms/VectorAssignment.h index 36829be82..c861579f4 100644 --- a/src/TNL/Containers/Algorithms/VectorAssignment.h +++ b/src/TNL/Containers/Algorithms/VectorAssignment.h @@ -12,7 +12,6 @@ #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h deleted file mode 100644 index a12b310e3..000000000 --- a/src/TNL/Containers/Algorithms/VectorOperations.h +++ /dev/null @@ -1,64 +0,0 @@ 
-/*************************************************************************** - VectorOperations.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< typename Device > -class VectorOperations{}; - -template<> -class VectorOperations< Devices::Host > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -template<> -class VectorOperations< Devices::Cuda > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL - -#include -#include diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h deleted file mode 100644 index 4b53dbcf7..000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +++ /dev/null @@ -1,58 +0,0 @@ -/*************************************************************************** - VectorOperationsCuda_impl.h - description - 
------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, - typename Vector > -void -VectorOperations< Devices::Cuda >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Cuda, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Cuda >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Cuda, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h deleted file mode 100644 index 50d591b9f..000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ /dev/null @@ -1,55 +0,0 @@ 
-/*************************************************************************** - VectorOperationsHost_impl.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, typename Vector > -void -VectorOperations< Devices::Host >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Host, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Host >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Host, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp index 7338c8317..525ba6380 100644 --- a/src/TNL/Containers/Vector.hpp +++ 
b/src/TNL/Containers/Vector.hpp @@ -175,7 +175,10 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, @@ -190,7 +193,10 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h index 1662afb7c..0d0395454 100644 --- a/src/TNL/Containers/VectorView.h +++ b/src/TNL/Containers/VectorView.h @@ -14,7 +14,7 @@ #include #include -#include +#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp index 986d2bc0f..057d402d7 100644 --- a/src/TNL/Containers/VectorView.hpp +++ b/src/TNL/Containers/VectorView.hpp @@ -11,8 +11,6 @@ #pragma once #include -#include -#include #include #include @@ -127,7 +125,10 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const 
RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, @@ -141,7 +142,10 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index 8fae76884..93283483c 100644 --- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace binary_tests { diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index 147f5946f..7c68c86f7 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -22,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // should be small enough to have fast tests, but larger than minGPUReductionDataSize diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 733013933..aba569df7 100644 --- 
a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -20,7 +20,6 @@ constexpr int VECTOR_TEST_SIZE = 10000; TYPED_TEST( VectorTest, prefixSum ) { using VectorType = typename TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; @@ -79,7 +78,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -87,7 +86,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); @@ -95,7 +94,7 @@ TYPED_TEST( VectorTest, prefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i + 1 ); @@ -103,7 +102,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, 
IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -111,11 +110,11 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } @@ -123,7 +122,6 @@ TYPED_TEST( VectorTest, prefixSum ) TYPED_TEST( VectorTest, exclusivePrefixSum ) { using VectorType = typename TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; @@ -186,12 +184,12 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( 
Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -199,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -207,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); @@ -215,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -223,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType 
>::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -231,11 +229,11 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } diff --git a/src/UnitTests/Containers/VectorTestSetup.h b/src/UnitTests/Containers/VectorTestSetup.h index 7338f9637..5c342dced 100644 --- a/src/UnitTests/Containers/VectorTestSetup.h +++ b/src/UnitTests/Containers/VectorTestSetup.h @@ -8,8 +8,6 @@ /* See Copyright Notice in tnl/Copyright */ -// NOTE: Vector = Array + VectorOperations, so we test Vector and VectorOperations at the same time - #pragma once #ifdef HAVE_GTEST @@ -24,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // test fixture for typed tests @@ -33,7 +30,6 @@ class VectorTest : public ::testing::Test { protected: using VectorType = Vector; - using VectorOperations = Algorithms::VectorOperations< typename VectorType::DeviceType >; using ViewType = VectorView< typename Vector::RealType, typename Vector::DeviceType, typename Vector::IndexType >; }; diff --git 
a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index a0aa2695f..122404253 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace unary_tests { diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index 758ee1d50..93ff286ae 100644 --- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace vertical_tests { -- GitLab From e20a09309a4a02fc387e4ae8b80b435aa4e79935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 12:48:06 +0200 Subject: [PATCH 08/23] Changed reduction operation to use functions with `return a + b` instead of `a += b` This is nicer because it more clearly separates data load, computation and data store. Furthermore, it allows to use instances of std::plus, std::logical_and, std::logical_or, etc. instead of custom lambda functions. 
--- .../BLAS/CommonVectorOperations.hpp | 56 +++++++++---------- .../Algorithms/ArrayOperationsCuda.hpp | 12 ++-- .../Algorithms/CudaMultireductionKernel.h | 34 +++++------ .../Algorithms/CudaReductionKernel.h | 34 +++++------ .../Containers/Algorithms/Multireduction.hpp | 38 ++++++------- src/TNL/Containers/Algorithms/Reduction.hpp | 37 ++++++------ src/TNL/Containers/Expressions/Comparison.h | 30 +++++----- .../DistributedExpressionTemplates.h | 12 ++-- .../Expressions/ExpressionTemplates.h | 12 ++-- .../Expressions/StaticExpressionTemplates.h | 24 ++++---- .../Expressions/VerticalOperations.h | 26 ++++----- src/TNL/Solvers/Linear/GMRES_impl.h | 4 +- src/TNL/Solvers/ODE/Euler.hpp | 2 +- src/TNL/Solvers/ODE/Merson_impl.h | 2 +- src/UnitTests/Containers/MultireductionTest.h | 2 +- .../Containers/VectorEvaluateAndReduceTest.h | 6 +- 16 files changed, 165 insertions(+), 166 deletions(-) diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index 033e99301..cc6d0baad 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -29,7 +29,7 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -46,7 +46,7 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto reduction = [] __cuda_callable__ ( 
const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -63,7 +63,7 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -80,7 +80,7 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -97,7 +97,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -113,8 +113,8 @@ getVectorL2Norm( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; - auto 
reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } @@ -137,8 +137,8 @@ getVectorLpNorm( const Vector& v, return getVectorL2Norm< Vector, ResultType >( v ); const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } @@ -157,8 +157,8 @@ getVectorSum( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -177,8 +177,8 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const 
ResultType& b ) { a = TNL::max( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -197,8 +197,8 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -217,8 +217,8 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -237,8 +237,8 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ 
i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -257,8 +257,8 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -281,7 +281,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, auto diff = data1[ i ] - data2[ i ]; return diff * diff; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } @@ -307,8 +307,8 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( 
TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } @@ -327,8 +327,8 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -347,8 +347,8 @@ getScalarProduct( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 3322a910e..4a75c6fff 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -132,8 +132,8 @@ compare( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." 
); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } @@ -149,8 +149,8 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, (Index) 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a || b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } @@ -166,8 +166,8 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index 782e66eaa..e67c11b41 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -67,19 +67,19 @@ CudaMultireductionKernel( const Result zero, // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSizeX < size ) { - reduction( sdata[ tid ], dataFetcher( gid, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); gid += 4 * gridSizeX; } while( gid + 2 * gridSizeX < size ) { - reduction( sdata[ tid ], dataFetcher( gid, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); gid += 2 * gridSizeX; } while( gid < size ) { - reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); gid += gridSizeX; } 
__syncthreads(); @@ -87,48 +87,48 @@ CudaMultireductionKernel( const Result zero, // Perform the parallel reduction. if( blockSizeX >= 1024 ) { if( threadIdx.x < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSizeX >= 512 ) { if( threadIdx.x < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSizeX >= 256 ) { if( threadIdx.x < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSizeX >= 128 ) { if( threadIdx.x < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( threadIdx.x < 32 ) { if( blockSizeX >= 64 ) - reduction( sdata[ tid ], sdata[ tid + 32 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. 
if( blockSizeX >= 32 ) - reduction( sdata[ tid ], sdata[ tid + 16 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSizeX >= 16 ) - reduction( sdata[ tid ], sdata[ tid + 8 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSizeX >= 8 ) - reduction( sdata[ tid ], sdata[ tid + 4 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSizeX >= 4 ) - reduction( sdata[ tid ], sdata[ tid + 2 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSizeX >= 2 ) - reduction( sdata[ tid ], sdata[ tid + 1 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 03da1e556..ce278c7f1 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -63,19 +63,19 @@ CudaReductionKernel( const Result zero, // Start with the sequential reduction and push the result into the shared memory. 
while( gid + 4 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -83,48 +83,48 @@ CudaReductionKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { if( blockSize >= 64 ) - reduction( sdata[ tid ], sdata[ tid + 32 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) - reduction( sdata[ tid ], sdata[ tid + 16 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSize >= 16 ) - reduction( sdata[ tid ], sdata[ tid + 8 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSize >= 8 ) - reduction( sdata[ tid ], sdata[ tid + 4 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSize >= 4 ) - reduction( sdata[ tid ], sdata[ tid + 2 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSize >= 2 ) - reduction( sdata[ tid ], sdata[ tid + 1 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index 2f1999303..4044884cb 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -72,10 +72,10 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( int i = 0; i < block_size; i += 4 ) { - reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); - reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); - reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); - reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); } } } @@ -86,23 +86,23 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( Index i = blocks * block_size; i < size; i++ ) - reduction( _r[ 0 ], dataFetcher( i, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); } } // local reduction of unrolled results for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; - reduction( _r[ 0 ], _r[ 1 ] ); - reduction( _r[ 0 ], _r[ 2 ] ); - reduction( _r[ 0 ], _r[ 3 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); } // inter-thread reduction of local results #pragma omp critical { for( int k = 0; k < n; k++ ) - reduction( result[ k ], r[ 4 * k ] ); + result[ k ] = reduction( result[ k ], r[ 4 * k ] ); } } else { @@ -120,10 +120,10 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( int i = 0; i < block_size; i += 4 ) { - reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); - reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); - reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) 
); - reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); } } } @@ -132,15 +132,15 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( Index i = blocks * block_size; i < size; i++ ) - reduction( _r[ 0 ], dataFetcher( i, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); } // reduction of unrolled results for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; - reduction( _r[ 0 ], _r[ 1 ] ); - reduction( _r[ 0 ], _r[ 2 ] ); - reduction( _r[ 0 ], _r[ 3 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); // copy the result into the output parameter result[ k ] = _r[ 0 ]; @@ -154,13 +154,13 @@ reduce( const Result zero, const Index offset = b * block_size; for( int k = 0; k < n; k++ ) { for( int i = 0; i < block_size; i++ ) - reduction( result[ k ], dataFetcher( offset + i, k ) ); + result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) ); } } for( int k = 0; k < n; k++ ) { for( Index i = blocks * block_size; i < size; i++ ) - reduction( result[ k ], dataFetcher( i, k ) ); + result[ k ] = reduction( result[ k ], dataFetcher( i, k ) ); } } #ifdef HAVE_OPENMP diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 28ddfaf26..19c84b9dd 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -65,10 +65,10 @@ reduce( const Index size, for( int b = 0; b < blocks; b++ ) { const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( 
r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } @@ -76,18 +76,18 @@ reduce( const Index size, #pragma omp single nowait { for( Index i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); } // local reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - reduction( result, r[ 0 ] ); + result = reduction( result, r[ 0 ] ); } } return result; @@ -102,28 +102,27 @@ reduce( const Index size, for( int b = 0; b < blocks; b++ ) { const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) for( Index i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); - //operation.dataFetcher( r[ 0 ], i, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = 
reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { Result result = zero; for( Index i = 0; i < size; i++ ) - reduction( result, dataFetcher( i ) ); + result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index e65ec4958..a84b17fec 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -65,7 +65,7 @@ struct VectorComparison< T1, T2, false > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -96,7 +96,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -110,7 +110,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -124,7 +124,7 @@ struct 
Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -138,7 +138,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -155,7 +155,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -170,7 +170,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -180,7 +180,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = 
typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -190,7 +190,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -200,7 +200,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } }; @@ -217,7 +217,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -232,7 +232,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; 
}; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -242,7 +242,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -252,7 +252,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -262,7 +262,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 70eb37cc8..1d3077b43 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ 
-2190,7 +2190,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2209,7 +2209,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2231,7 +2231,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2254,7 +2254,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2280,7 +2280,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2303,7 +2303,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; diff --git 
a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h index 8816f153b..f01d8a68e 100644 --- a/src/TNL/Containers/Expressions/ExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h @@ -2113,7 +2113,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2132,7 +2132,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2154,7 +2154,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2177,7 +2177,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2203,7 +2203,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2226,7 +2226,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation 
>& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h index 0a3e8a7fb..1bafe7cfc 100644 --- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h @@ -2280,12 +2280,12 @@ template< typename Vector, __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2297,12 +2297,12 @@ template< typename Vector, __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2317,14 +2317,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2337,14 +2337,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduce( Vector& lhs, const 
Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2360,14 +2360,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } @@ -2380,14 +2380,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index 2470a7f22..29f97c985 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -32,7 +32,7 @@ auto ExpressionMin( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a < b ? 
a : b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -44,7 +44,7 @@ auto ExpressionArgMin( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a > b ) { a = b; aIdx = bIdx; @@ -62,7 +62,7 @@ auto ExpressionMax( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a > b ? 
a : b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -74,7 +74,7 @@ auto ExpressionArgMax( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a < b ) { a = b; aIdx = bIdx; @@ -92,7 +92,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -103,7 +103,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -114,7 +114,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( 
using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ] * expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -125,7 +125,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -136,7 +136,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a *= b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a * b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 1 ); } @@ -147,7 +147,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a && b; }; + auto reduction = [] __cuda_callable__ ( 
const ResultType& a, const ResultType& b ) { return a && b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -158,7 +158,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a || b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a || b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -169,7 +169,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a & b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a & b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -180,7 +180,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a | b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a | b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h 
b/src/TNL/Solvers/Linear/GMRES_impl.h index 725ba3597..cf87ea2b2 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -439,7 +439,7 @@ hauseholder_generate( const int i, const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); const IndexType ldSize = this->ldSize; auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, @@ -549,7 +549,7 @@ hauseholder_cwy_transposed( VectorViewType z, const RealType* _w = Traits::getConstLocalView( w ).getData(); const IndexType ldSize = this->ldSize; auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index 3506152ff..a4ff45fef 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -119,7 +119,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) continue; } } - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 55dbbdf4e..67fd5e101 
100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -173,7 +173,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType lastResidue = this->getResidue(); time += currentTau; - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index a876af2c3..aeda3eeff 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -114,7 +114,7 @@ void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVect TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); return _V[ i + k * size ] * _y[ i ]; }; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index 7c68c86f7..3ff2d932f 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -36,7 +36,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; return evaluateAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } @@ -72,7 +72,7 @@ performAddAndReduce1( 
VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; return addAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } @@ -82,7 +82,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; return addAndReduce( w, 5.0 * v, reduction, ( RealType ) 0.0 ); } -- GitLab From 0a57393f55caafab2c8b271cf98536700c30b209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 13:58:21 +0200 Subject: [PATCH 09/23] Replaced custom lambda functions with instances of STL types where possible --- .../BLAS/CommonVectorOperations.hpp | 27 ++++------- .../Algorithms/ArrayOperationsCuda.hpp | 9 ++-- .../Containers/Algorithms/Multireduction.h | 6 +++ src/TNL/Containers/Algorithms/Reduction.h | 9 ++-- src/TNL/Containers/Algorithms/Reduction.hpp | 8 ++-- src/TNL/Containers/Expressions/Comparison.h | 45 +++++++------------ .../Expressions/VerticalOperations.h | 27 ++++------- src/TNL/Solvers/Linear/GMRES_impl.h | 6 +-- src/TNL/Solvers/ODE/Euler.hpp | 3 +- src/TNL/Solvers/ODE/Merson_impl.h | 3 +- src/UnitTests/Containers/MultireductionTest.h | 3 +- .../Containers/VectorEvaluateAndReduceTest.h | 9 ++-- 12 files changed, 59 insertions(+), 96 deletions(-) diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index cc6d0baad..640fda337 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -97,8 +97,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] 
__cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -114,8 +113,7 @@ getVectorL2Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -138,8 +136,7 @@ getVectorLpNorm( const Vector& v, const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -158,8 +155,7 @@ getVectorSum( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType 
) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -258,8 +254,7 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -281,8 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, auto diff = data1[ i ] - data2[ i ]; return diff * diff; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -308,8 +302,7 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -328,8 +321,7 @@ 
getVectorDifferenceSum( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -348,8 +340,7 @@ getScalarProduct( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 4a75c6fff..46618fcd7 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -133,8 +133,7 @@ compare( const Element1* destination, TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." 
); auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } template< typename Element, @@ -150,8 +149,7 @@ containsValue( const Element* data, TNL_ASSERT_GE( size, (Index) 0, "" ); auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a || b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); + return Reduction< Devices::Cuda >::reduce( size, std::logical_or<>{}, fetch, false ); } template< typename Element, @@ -167,8 +165,7 @@ containsOnlyValue( const Element* data, TNL_ASSERT_GE( size, 0, "" ); auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 12ed22003..9802a2953 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -12,6 +12,8 @@ #pragma once +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. + #include #include @@ -32,6 +34,8 @@ struct Multireduction< Devices::Host > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. 
* size: the size of each dataset * n: number of datasets to be reduced * result: output array of size = n @@ -59,6 +63,8 @@ struct Multireduction< Devices::Cuda > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index d4406332b..41b000221 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -13,6 +13,7 @@ #pragma once #include // std::pair +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. #include #include @@ -33,7 +34,7 @@ struct Reduction< Devices::Host > typename DataFetcher > static Result reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); @@ -43,7 +44,7 @@ struct Reduction< Devices::Host > typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -57,7 +58,7 @@ struct Reduction< Devices::Cuda > typename DataFetcher > static Result reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); @@ -67,7 +68,7 @@ struct Reduction< Devices::Cuda > typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp 
b/src/TNL/Containers/Algorithms/Reduction.hpp index 19c84b9dd..c33065d19 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -45,7 +45,7 @@ template< typename Index, Result Reduction< Devices::Host >:: reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -137,7 +137,7 @@ template< typename Index, std::pair< Index, Result > Reduction< Devices::Host >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -260,7 +260,7 @@ template< typename Index, Result Reduction< Devices::Cuda >:: reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -336,7 +336,7 @@ template< typename Index, std::pair< Index, Result > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index a84b17fec..616ad5807 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -65,8 +65,7 @@ struct VectorComparison< T1, T2, false > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -96,8 +95,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename 
T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -110,8 +108,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -124,8 +121,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -138,8 +134,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), 
std::logical_and<>{}, fetch, true ); } }; @@ -155,8 +150,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -170,8 +164,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -180,8 +173,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -190,8 +182,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< 
DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -200,8 +191,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -217,8 +207,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -232,8 +221,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -242,8 +230,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ 
i ] >= b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -252,8 +239,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -262,8 +248,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index 29f97c985..29e904bbf 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -92,8 +92,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return 
Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -103,8 +102,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -114,8 +112,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ] * expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression, typename Real > @@ -125,8 +122,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return 
Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -136,8 +132,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a * b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 1 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::multiplies<>{}, fetch, (ResultType) 1 ); } template< typename Expression > @@ -147,8 +142,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a && b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -158,8 +152,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a || b; }; - return 
Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_or<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -169,8 +162,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a & b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -180,8 +172,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a | b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_or<>{}, fetch, (ResultType) 0 ); } } // namespace Expressions diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index cf87ea2b2..7f316d730 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -439,11 +439,10 @@ hauseholder_generate( const int i, const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); const IndexType ldSize = this->ldSize; 
auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, - reduction, + std::plus<>{}, size, i, aux ); @@ -549,11 +548,10 @@ hauseholder_cwy_transposed( VectorViewType z, const RealType* _w = Traits::getConstLocalView( w ).getData(); const IndexType ldSize = this->ldSize; auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, - reduction, + std::plus<>{}, size, i + 1, aux ); diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index a4ff45fef..12da6439b 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -119,8 +119,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) continue; } } - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + this->setResidue( addAndReduceAbs( u, currentTau * k1, std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** * When time is close to stopTime the new residue diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 67fd5e101..3c88576e9 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -173,9 +173,8 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType lastResidue = this->getResidue(); time += currentTau; - auto reduction = [] __cuda_callable__ ( const RealType& 
a, const RealType& b ) { return a + b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), - reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); ///// // When time is close to stopTime the new residue diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index aeda3eeff..7a321f583 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -114,11 +114,10 @@ void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVect TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); return _V[ i + k * size ] * _y[ i ]; }; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, - reduction, + std::plus<>{}, size, n, result.getData() ); diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index 3ff2d932f..8c0cd7f90 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -36,8 +36,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - return evaluateAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); + return evaluateAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } TYPED_TEST( VectorTest, evaluateAndReduce ) @@ -72,8 +71,7 @@ performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - return addAndReduce( w, u * v, reduction, ( 
RealType ) 0.0 ); + return addAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } template< typename VectorView > @@ -82,8 +80,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - return addAndReduce( w, 5.0 * v, reduction, ( RealType ) 0.0 ); + return addAndReduce( w, 5.0 * v, std::plus<>{}, ( RealType ) 0.0 ); } -- GitLab From 32c69a1143406a8e11613273e3014e57dc4362f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 17:46:53 +0200 Subject: [PATCH 10/23] Ugly workaround for nvcc's stupid modification of `new` expressions --- .../tnl-benchmark-simple-heat-equation.h | 11 ++++- src/TNL/Containers/Algorithms/Reduction.hpp | 49 +++++++++++++++++-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h index 33dff1ded..944e7f73f 100644 --- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h +++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h @@ -259,12 +259,21 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters, const Index dofsCount = gridXSize * gridYSize; dim3 cudaUpdateBlocks( dofsCount / 256 + ( dofsCount % 256 != 0 ) ); dim3 cudaUpdateBlockSize( 256 ); - + /**** * Initiation */ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out some warnings... 
+ #ifdef __NVCC__ + Real* u = new Real[ static_cast(dofsCount) ]; + Real* aux = new Real[ static_cast(dofsCount) ]; + #else Real* u = new Real[ dofsCount ]; Real* aux = new Real[ dofsCount ]; + #endif Real* max_du = new Real[ cudaUpdateBlocks.x ]; if( ! u || ! aux ) { diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index c33065d19..20265c6b0 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -279,7 +279,7 @@ reduce( const Index size, // start the reduction on the GPU Result* deviceAux1( 0 ); - Index reducedSize = reductionLauncher.start( + const int reducedSize = reductionLauncher.start( reduction, dataFetcher, zero, @@ -294,7 +294,20 @@ reduce( const Index size, if( can_reduce_later_on_host ) { // transfer the reduced data from device to host - std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. 
+ #ifdef __NVCC__ + new Result[ static_cast(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -356,7 +369,7 @@ reduceWithArgument( const Index size, // start the reduction on the GPU Result* deviceAux1( nullptr ); Index* deviceIndexes( nullptr ); - Index reducedSize = reductionLauncher.startWithArgument( + const int reducedSize = reductionLauncher.startWithArgument( reduction, dataFetcher, zero, @@ -372,8 +385,34 @@ reduceWithArgument( const Index size, if( can_reduce_later_on_host ) { // transfer the reduced data from device to host - std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; - std::unique_ptr< Index[] > indexArray{ new Index[ reducedSize ] }; + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new Result[ static_cast(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; + std::unique_ptr< Index[] > indexArray{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. 
+ #ifdef __NVCC__ + new Index[ static_cast(reducedSize) ] + #else + new Index[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); ArrayOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); -- GitLab From 505d0b68c3d445db50c6b7d1e2cc20fd598d6a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 18:40:50 +0200 Subject: [PATCH 11/23] Optimized OpenMP thread counts for reduction and multireduction --- .../Containers/Algorithms/Multireduction.hpp | 88 ++++++++++--------- src/TNL/Containers/Algorithms/Reduction.hpp | 10 ++- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index 4044884cb..f4407e620 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -50,59 +50,61 @@ reduce( const Result zero, const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) -#pragma omp parallel - { - // first thread initializes the result array - #pragma omp single nowait + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { - for( int k = 0; k < n; k++ ) - result[ k ] = zero; - } + // first thread initializes the result array + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + } - // initialize array for thread-local results - // (it is accessed as a row-major matrix with n rows and 4 columns) - Result r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = zero; + // initialize array for thread-local results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k 
] = zero; - #pragma omp for nowait - for( int b = 0; b < blocks; b++ ) { - const Index offset = b * block_size; - for( int k = 0; k < n; k++ ) { - Result* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); - _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); - _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); - _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + #pragma omp for nowait + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } } } - } - // the first thread that reaches here processes the last, incomplete block - #pragma omp single nowait - { + // the first thread that reaches here processes the last, incomplete block + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + } + + // local reduction of unrolled results for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; - for( Index i = blocks * block_size; i < size; i++ ) - _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); } - } - // local reduction of unrolled results - for( int k = 0; k < n; k++ ) { - Result* _r = r + 4 * k; - _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); - _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); - _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); - } - - // inter-thread reduction of local results - #pragma omp 
critical - { - for( int k = 0; k < n; k++ ) - result[ k ] = reduction( result[ k ], r[ 4 * k ] ); + // inter-thread reduction of local results + #pragma omp critical + { + for( int k = 0; k < n; k++ ) + result[ k ] = reduction( result[ k ], r[ 4 * k ] ); + } } } else { diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 20265c6b0..229af1379 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -53,10 +53,11 @@ reduce( const Index size, const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable Result result = zero; -#pragma omp parallel + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results Result r[ 4 ] = { zero, zero, zero, zero }; @@ -145,10 +146,11 @@ reduceWithArgument( const Index size, const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable std::pair< Index, Result > result( -1, zero ); -#pragma omp parallel + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results Index arg[ 4 ] = { 0, 0, 0, 0 }; -- GitLab From 232be124a0f776ce48e663d1d474183f641e9549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 18:42:21 +0200 Subject: [PATCH 12/23] Benchmarks: added scalar multiplication with BLAS --- src/Benchmarks/BLAS/vector-operations.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 
25fa055bf..c7f1cf751 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -435,6 +435,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto multiplyCuda = [&]() { deviceVector *= 0.5; }; +#ifdef HAVE_BLAS + auto multiplyBlas = [&]() { + blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 ); + }; +#endif #ifdef HAVE_CUDA auto multiplyCublas = [&]() { const Real alpha = 0.5; @@ -445,6 +450,9 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas ); +#endif #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); -- GitLab From e6e6cf46bd017d806a06ca138935cbf0721b6afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 19:08:43 +0200 Subject: [PATCH 13/23] Removed timing parameter from benchmarks Benchmarks can be easily profiled even without this parameter, so it was just an unnecessary complication. 
--- src/Benchmarks/Benchmarks.h | 48 +++++++++------------------------- src/Benchmarks/FunctionTimer.h | 33 +++++++++-------------- 2 files changed, 25 insertions(+), 56 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 42b19814e..c4ae772a7 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -65,18 +65,17 @@ public: using Logging::MetadataMap; using Logging::MetadataColumns; using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >; - + Benchmark( int loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) {} - + static void configSetup( Config::ConfigDescription& config ) { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< bool >( "reset", "Call reset function between loops.", true ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); - config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } @@ -85,7 +84,6 @@ public: this->loops = parameters.getParameter< int >( "loops" ); this->reset = parameters.getParameter< bool >( "reset" ); this->minTime = parameters.getParameter< double >( "min-time" ); - this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< int >( "verbose" ); Logging::setVerbose( verbose ); } @@ -96,7 +94,7 @@ public: { this->loops = loops; } - + void setMinTime( const double& minTime ) { this->minTime = minTime; @@ -121,7 +119,6 @@ public: metadata["loops"] = convertToString(loops); metadata["reset"] = convertToString( reset ); metadata["minimal test time"] = convertToString( minTime ); - metadata["timing"] = convertToString( timing ); writeMetadata( metadata ); } @@ -208,28 +205,16 @@ public: if( verbose > 1 ) { // run the monitor main loop 
Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } @@ -248,7 +233,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ResetFunction, typename ComputeFunction, typename... NextComputations > @@ -277,16 +262,10 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - result.time = functionTimer. 
template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -304,7 +283,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ComputeFunction, typename... NextComputations > inline double @@ -345,7 +324,6 @@ protected: double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; - bool timing = true; bool reset = true; SolverMonitorType monitor; }; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index a76ebd558..1fad1e946 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -28,8 +28,7 @@ class FunctionTimer public: using DeviceType = Device; - template< bool timing, - typename ComputeFunction, + template< typename ComputeFunction, typename ResetFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > double @@ -61,26 +60,24 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.start(); + timer.start(); for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) + loops < maxLoops || timer.getRealTime() < minTime; + loops++ ) compute(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.stop(); + 
timer.stop(); } else { for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) + loops < maxLoops || timer.getRealTime() < minTime; + loops++ ) { // abuse the monitor's "time" for loops monitor.setTime( loops + 1 ); @@ -91,25 +88,19 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.start(); + timer.start(); compute(); #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.stop(); + timer.stop(); } } - if( timing ) - return timer.getRealTime() / ( double ) loops; - else - return std::numeric_limits::quiet_NaN(); + return timer.getRealTime() / ( double ) loops; } - template< bool timing, - typename ComputeFunction, + template< typename ComputeFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > double timeFunction( ComputeFunction compute, @@ -119,7 +110,7 @@ class FunctionTimer Monitor && monitor = Monitor() ) { auto noReset = [] () {}; - return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } int getPerformedLoops() const -- GitLab From 2bea9311ab9ed1211fb2dcaf6478312c79c9a7ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 19:53:58 +0200 Subject: [PATCH 14/23] Benchmarks: compute sample standard deviation of the measured computation times --- src/Benchmarks/Benchmarks.h | 23 ++- src/Benchmarks/FunctionTimer.h | 160 +++++++-------- src/Benchmarks/Logging.h | 365 ++++++++++++++++----------------- 3 files changed, 271 insertions(+), 277 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index c4ae772a7..683a18376 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -17,7 +17,6 @@ #include "Logging.h" #include -#include #include 
#include @@ -35,24 +34,24 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; - struct BenchmarkResult { using HeaderElements = Logging::HeaderElements; using RowElements = Logging::RowElements; - double bandwidth = std::numeric_limits::quiet_NaN(); double time = std::numeric_limits::quiet_NaN(); + double stddev = std::numeric_limits::quiet_NaN(); + double bandwidth = std::numeric_limits::quiet_NaN(); double speedup = std::numeric_limits::quiet_NaN(); virtual HeaderElements getTableHeader() const { - return HeaderElements({"bandwidth", "time", "speedup"}); + return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" }); } virtual RowElements getRowElements() const { - return RowElements({ bandwidth, time, speedup }); + return RowElements({ time, stddev, stddev / time, bandwidth, speedup }); } }; @@ -200,21 +199,22 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->reset ) - result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { if( this->reset ) - result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = 
functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } @@ -257,15 +257,16 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 1fad1e946..1edd61204 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -17,6 +17,7 @@ #include #include +#include #include namespace TNL { @@ -25,101 +26,94 @@ namespace Benchmarks { template< typename Device > class FunctionTimer { - public: - using DeviceType = Device; - - template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - ResetFunction reset, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor(), - bool performReset = true ) +public: + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + 
ResetFunction reset, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + if( verbose > 1 ) + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + Containers::Vector< double > results( maxLoops ); + results.setValue( 0.0 ); + + for( loops = 0; + loops < maxLoops || sum( results ) < minTime; + loops++ ) { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - if( verbose > 1 ) - monitor.setTimer( timer ); - - // warm up + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); reset(); - compute(); - // If we do not perform reset function and don't need - // the monitor, the timer is not interrupted after each loop. - if( ! performReset && verbose < 2 ) - { - // Explicit synchronization of the CUDA device + // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.start(); - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - loops++ ) - compute(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.stop(); - } - else - { - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - loops++ ) - { - // abuse the monitor's "time" for loops - monitor.setTime( loops + 1 ); - reset(); - - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); + // reset timer before each computation + timer.reset(); + timer.start(); + compute(); #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda 
>::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.stop(); - } - } - return timer.getRealTime() / ( double ) loops; - } + timer.stop(); - template< typename ComputeFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - auto noReset = [] () {}; - return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + results[ loops ] = timer.getRealTime(); } - int getPerformedLoops() const - { - return this->loops; + const double mean = sum( results ) / (double) loops; + if( loops > 1 ) { + const double stddev = 1.0 / std::sqrt( loops - 1 ) * l2Norm( results - mean ); + return std::make_pair( mean, stddev ); } - - protected: - int loops; + else { + const double stddev = std::numeric_limits::quiet_NaN(); + return std::make_pair( mean, stddev ); + } + } + + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor ); + } + + int getPerformedLoops() const + { + return this->loops; + } + +protected: + int loops; }; } // namespace Benchmarks diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h index b10ab7199..61608d364 100644 --- a/src/Benchmarks/Logging.h +++ b/src/Benchmarks/Logging.h @@ -16,225 +16,224 @@ #include #include #include +#include #include #include +#include + namespace TNL { - namespace Benchmarks { 
+namespace Benchmarks { class Logging { - public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using HeaderElements = std::vector< String >; - using RowElements = std::vector< double >; - - Logging( int verbose = true ) - : verbose(verbose) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { +public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector; + + using HeaderElements = std::vector< String >; + using RowElements = std::vector< double >; + + Logging( int verbose = true ) + : verbose(verbose) + {} + + void + setVerbose( int verbose) + { + this->verbose = verbose; + } + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : 
metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; + std::cout << std::setw( 20 ) << it.first; } - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; - log << header_indent << " " << spanningElement << std::endl; for( auto & it : subElements ) { - log << header_indent << "! 
" << it << std::endl; + std::cout << std::setw( 15 ) << it; } + std::cout << std::endl; - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } + header_changed = false; } - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; } - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns + // dump stacked spanning columns + if( horizontalGroups.size() > 0 ) while( horizontalGroups.back().second <= 0 ) { horizontalGroups.pop_back(); header_indent.pop_back(); } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << 
std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; } + } - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; } - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); } + } - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; } - return false; + std::cout << std::endl; } - protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); + // only when changed (the header has been already adjusted) + // print each 
element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; } - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); - } // namespace Benchmarks -} // namespace TNL + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + +protected: + // manual double -> String conversion with 
fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + int verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; +} // namespace Benchmarks +} // namespace TNL -- GitLab From 7cc55deee1cb6f5610d152ed2fe7e9eec48033af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 22:18:47 +0200 Subject: [PATCH 15/23] Implemented parallel prefix-sum with OpenMP Fixes #42 --- src/TNL/Containers/Algorithms/PrefixSum.hpp | 71 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 1ffd4cf4e..59c160bf1 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -12,6 +12,8 @@ #pragma once +#include // std::unique_ptr + #include "PrefixSum.h" //#define CUDA_REDUCTION_PROFILING @@ -56,21 +58,76 @@ perform( Vector& v, using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - // TODO: parallelize with OpenMP - if( Type == PrefixSumType::Inclusive ) +#ifdef HAVE_OPENMP + const int threads = Devices::Host::getMaxThreadsCount(); + std::unique_ptr< RealType[] > block_sums{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new RealType[reducedSize]` in the source code to `new (RealType[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... 
+ // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new RealType[ static_cast(threads) + 1 ] + #else + new RealType[ threads + 1 ] + #endif + }; + block_sums[ 0 ] = zero; + + #pragma omp parallel + { + // init + const int thread_idx = omp_get_thread_num(); + RealType block_sum = zero; + + // perform prefix-sum on blocks statically assigned to threads + if( Type == PrefixSumType::Inclusive ) { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + reduction( block_sum, v[ i ] ); + v[ i ] = block_sum; + } + } + else { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; + v[ i ] = block_sum; + reduction( block_sum, x ); + } + } + + // write the block sums into the buffer + block_sums[ thread_idx + 1 ] = block_sum; + #pragma omp barrier + + // calculate per-block offsets + RealType offset = 0; + for( int i = 0; i < thread_idx + 1; i++ ) + reduction( offset, block_sums[ i ] ); + + // shift intermediate results by the offset + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) + reduction( v[ i ], offset ); + } +#else + if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) reduction( v[ i ], v[ i - 1 ] ); + } else // Exclusive prefix sum { - RealType aux( v[ begin ] ); - v[ begin ] = zero; - for( IndexType i = begin + 1; i < end; i++ ) - { - RealType x = v[ i ]; + RealType aux = zero; + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; v[ i ] = aux; reduction( aux, x ); } } +#endif } //// -- GitLab From 27631930833ddf68032b41afb2aafc06fdaa66e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 22:19:11 +0200 Subject: [PATCH 16/23] Added prefix-sum to BLAS benchmarks --- src/Benchmarks/BLAS/vector-operations.h | 62 
+++++++++++-------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index c7f1cf751..80c63020d 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -346,7 +346,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normCudaET = [&]() { resultDevice = lpNorm( deviceView, 3.0 ); }; - benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", l3normHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET ); @@ -369,7 +368,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductCudaET = [&]() { resultDevice = ( deviceView, deviceView2 ); }; - #ifdef HAVE_BLAS auto scalarProductBlas = [&]() { resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 ); @@ -395,38 +393,6 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif - //// - // Prefix sum - /* - std::cout << "Benchmarking prefix-sum:" << std::endl; - timer.reset(); - timer.start(); - hostVector.computePrefixSum(); - timer.stop(); - timeHost = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - - timer.reset(); - timer.start(); - deviceVector.computePrefixSum(); - timer.stop(); - timeDevice = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." 
<< std::endl; - std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; - - HostVector auxHostVector; - auxHostVector.setLike( deviceVector ); - auxHostVector = deviceVector; - for( int i = 0; i < size; i++ ) - if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) - { - std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; - } - */ - - //// // Scalar multiplication auto multiplyHost = [&]() { @@ -614,6 +580,34 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas ); #endif + //// + // Inclusive prefix sum + auto inclusivePrefixSumHost = [&]() { + hostVector.prefixSum(); + }; + auto inclusivePrefixSumCuda = [&]() { + deviceVector.prefixSum(); + }; + benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda ); +#endif + + //// + // Exclusive prefix sum + auto exclusivePrefixSumHost = [&]() { + hostVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + auto exclusivePrefixSumCuda = [&]() { + deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda ); +#endif + #ifdef HAVE_CUDA cublasDestroy( cublasHandle ); #endif -- GitLab From 8d0d2638648279850cd085dce774661fb0c307bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 23:09:47 +0200 Subject: [PATCH 17/23] Removed volatile reduction from PrefixSum and updated the normal reduction operation Same changes 
as for the regular Reduction operation... --- .../Algorithms/CudaPrefixSumKernel.h | 81 +++++++++---------- src/TNL/Containers/Algorithms/PrefixSum.h | 26 ++---- src/TNL/Containers/Algorithms/PrefixSum.hpp | 42 ++++------ src/TNL/Containers/Vector.hpp | 10 +-- src/TNL/Containers/VectorView.hpp | 10 +-- .../Containers/VectorPrefixSumTest.h | 12 +-- 6 files changed, 71 insertions(+), 110 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 0bfbe80ba..38eeb6d8d 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -24,13 +24,11 @@ namespace Algorithms { #ifdef HAVE_CUDA template< typename Real, - typename Operation, - typename VolatileOperation, + typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, - Operation operation, - VolatileOperation volatileOperation, + Reduction reduction, const Real zero, const Index size, const Index elementsInBlock, @@ -40,8 +38,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Real gridShift ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); - volatile Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; - volatile Real* warpSums = &auxData[ blockDim.x ]; + Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; + Real* warpSums = &auxData[ blockDim.x ]; const Index lastElementIdx = size - blockIdx.x * elementsInBlock; const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock ); @@ -70,7 +68,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } } if( blockIdx.x == 0 && threadIdx.x == 0 ) - operation( sharedData[ 0 ], gridShift ); + sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** * Perform the 
sequential prefix-sum. @@ -90,10 +88,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { - operation( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx.x ] = - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; chunkPointer++; } @@ -102,9 +101,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, */ const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize(); const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize(); - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks ) - volatileOperation( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + __syncwarp(); + } if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx.x ]; @@ -114,21 +115,23 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) - volatileOperation( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + 
warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + __syncwarp(); + } __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) - volatileOperation( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + __syncthreads(); /*** * Store the result back in global memory. */ - __syncthreads(); idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { @@ -136,7 +139,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; - operation( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); + sharedData[ Devices::Cuda::getInterleaving( idx ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ]; idx += blockDim.x; } @@ -147,8 +151,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, if( prefixSumType == PrefixSumType::Exclusive ) { Real aux = zero; - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); + aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); + aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx.x ] = aux; } else @@ -157,10 +161,10 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } template< typename Real, - typename Operation, + typename Reduction, typename Index > __global__ void -cudaSecondPhaseBlockPrefixSum( Operation operation, +cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, const Index elementsInBlock, Real gridShift, @@ -174,7 +178,7 @@ cudaSecondPhaseBlockPrefixSum( Operation 
operation, Index readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) { - operation( data[ readIdx + readOffset ], shift ); + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; } } @@ -185,12 +189,10 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { - template< typename Operation, - typename VolatileOperation > + template< typename Reduction > static void cudaRecursivePrefixSum( PrefixSumType prefixSumType_, - Operation& operation, - VolatileOperation& volatileOperation, + Reduction& reduction, const Real& zero, const Index size, const Index blockSize, @@ -221,8 +223,7 @@ struct CudaPrefixSumKernelLauncher const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, - operation, - volatileOperation, + reduction, zero, size, elementsInBlock, @@ -242,8 +243,7 @@ struct CudaPrefixSumKernelLauncher Real gridShift2 = zero; if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::Inclusive, - operation, - volatileOperation, + reduction, zero, numberOfBlocks, blockSize, @@ -254,7 +254,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << " auxArray2 = " << auxArray2 << std::endl; cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> - ( operation, + ( reduction, size, elementsInBlock, gridShift, @@ -273,25 +273,21 @@ struct CudaPrefixSumKernelLauncher /**** * \brief Starts prefix sum in CUDA. 
* - * \tparam Operation operation to be performed on particular elements - addition usually - * \tparam VolatileOperation - volatile version of Operation + * \tparam Reduction reduction to be performed on particular elements - addition usually * \param size is number of elements to be scanned * \param blockSize is CUDA block size * \param deviceInput is pointer to input data on GPU * \param deviceOutput is pointer to resulting array, can be the same as input - * \param operation is instance of Operation - * \param volatileOperation is instance of VolatileOperation - * \param zero is neutral element for given Operation + * \param reduction is instance of Reduction + * \param zero is neutral element for given Reduction */ - template< typename Operation, - typename VolatileOperation > + template< typename Reduction > static void start( const Index size, const Index blockSize, const Real *deviceInput, Real* deviceOutput, - Operation& operation, - VolatileOperation& volatileOperation, + Reduction& reduction, const Real& zero ) { /**** @@ -319,8 +315,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, - operation, - volatileOperation, + reduction, zero, currentSize, blockSize, @@ -371,5 +366,3 @@ int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; } // namespace Algorithms } // namespace Containers } // namespace TNL - - diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h index 53d6d7f83..fb90ccb44 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -38,14 +38,12 @@ class PrefixSum< Devices::Host, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename 
Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; @@ -54,14 +52,12 @@ class PrefixSum< Devices::Cuda, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; @@ -70,16 +66,14 @@ class SegmentedPrefixSum< Devices::Host, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; @@ -88,21 +82,17 @@ class SegmentedPrefixSum< Devices::Cuda, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; - - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 59c160bf1..0001a2f75 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -44,15 +44,13 @@ static constexpr int 
PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//25 // PrefixSum on host template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -86,7 +84,7 @@ perform( Vector& v, if( Type == PrefixSumType::Inclusive ) { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { - reduction( block_sum, v[ i ] ); + block_sum = reduction( block_sum, v[ i ] ); v[ i ] = block_sum; } } @@ -95,7 +93,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = block_sum; - reduction( block_sum, x ); + block_sum = reduction( block_sum, x ); } } @@ -106,17 +104,17 @@ perform( Vector& v, // calculate per-block offsets RealType offset = 0; for( int i = 0; i < thread_idx + 1; i++ ) - reduction( offset, block_sums[ i ] ); + offset = reduction( offset, block_sums[ i ] ); // shift intermediate results by the offset #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) - reduction( v[ i ], offset ); + v[ i ] = reduction( v[ i ], offset ); } #else if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { @@ -124,7 +122,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } #endif @@ -134,15 +132,13 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - 
typename VolatilePrefixSumOperation > + typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -155,7 +151,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, - volatileReduction, zero ); #endif } @@ -165,8 +160,7 @@ perform( Vector& v, // PrefixSum on host template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Host, Type >:: @@ -174,8 +168,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -186,7 +179,7 @@ perform( Vector& v, { for( IndexType i = begin + 1; i < end; i++ ) if( ! 
flags[ i ] ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { @@ -198,7 +191,7 @@ perform( Vector& v, if( flags[ i ] ) aux = zero; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } } @@ -207,8 +200,7 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Cuda, Type >:: @@ -216,8 +208,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -231,7 +222,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, - volatileReduction, zero );*/ #endif } diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp index 525ba6380..a8c626ee5 100644 --- a/src/TNL/Containers/Vector.hpp +++ b/src/TNL/Containers/Vector.hpp @@ -175,10 +175,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -193,10 +190,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] 
__cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp index 057d402d7..9cdde8ef2 100644 --- a/src/TNL/Containers/VectorView.hpp +++ b/src/TNL/Containers/VectorView.hpp @@ -125,10 +125,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -142,10 +139,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index aba569df7..3f0eee391 100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -189,7 
+189,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -197,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -205,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); @@ -213,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = 
v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -221,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -229,7 +229,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); -- GitLab From af6d1d6b6173b9ab3c725df2dc4cdbba9cae22a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 15 Aug 2019 13:22:42 +0200 Subject: [PATCH 18/23] Added default stream synchronizations after kernel launches in CudaPrefixSumKernel.h --- .../Containers/Algorithms/CudaPrefixSumKernel.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 38eeb6d8d..9d21e8970 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -220,7 +220,7 @@ struct CudaPrefixSumKernelLauncher */ const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; - const std::size_t sharedMemory 
= ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); + const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, reduction, @@ -231,6 +231,7 @@ struct CudaPrefixSumKernelLauncher output, auxArray1.getData(), gridShift ); + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; @@ -260,6 +261,7 @@ struct CudaPrefixSumKernelLauncher gridShift, auxArray2.getData(), output ); + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; cudaMemcpy( &gridShift, @@ -284,11 +286,11 @@ struct CudaPrefixSumKernelLauncher template< typename Reduction > static void start( const Index size, - const Index blockSize, - const Real *deviceInput, - Real* deviceOutput, - Reduction& reduction, - const Real& zero ) + const Index blockSize, + const Real *deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real& zero ) { /**** * Compute the number of grids @@ -323,11 +325,10 @@ struct CudaPrefixSumKernelLauncher gridShift, &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ] ); - TNL_CHECK_CUDA_DEVICE; } /*** - * Store the number of CUDA grids for a purpose of unit testing, i.e. + * Store the number of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. 
*/ gridsCount = numberOfGrids; -- GitLab From 1fe626408f7735f7c099a6c364736ff7a2ee8206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 15 Aug 2019 14:47:28 +0200 Subject: [PATCH 19/23] Replaced static member variables in CudaPrefixSumKernelLauncher with static getters --- .../Algorithms/CudaPrefixSumKernel.h | 41 ++++++++----------- src/TNL/Devices/Host.h | 12 +++--- .../Containers/VectorPrefixSumTest.h | 26 ++++++------ 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 9d21e8970..8ac802f96 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -297,8 +297,7 @@ struct CudaPrefixSumKernelLauncher */ const Index elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - //const auto maxGridSize = 3; //Devices::Cuda::getMaxGridSize(); - const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); Real gridShift = zero; //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; @@ -310,10 +309,10 @@ struct CudaPrefixSumKernelLauncher /**** * Compute current grid size and size of data to be scanned */ - const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; - if( currentSize / elementsInBlock > maxGridSize ) - currentSize = maxGridSize * elementsInBlock; + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, @@ -331,37 +330,31 @@ struct CudaPrefixSumKernelLauncher * Store the number 
of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. */ - gridsCount = numberOfGrids; + gridsCount() = numberOfGrids; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. */ - static void setMaxGridSize( int newMaxGridSize ) { - maxGridSize = newMaxGridSize; + static int& maxGridSize() + { + static int maxGridSize = Devices::Cuda::getMaxGridSize(); + return maxGridSize; } - static void resetMaxGridSize() { - maxGridSize = Devices::Cuda::getMaxGridSize(); + static void resetMaxGridSize() + { + maxGridSize() = Devices::Cuda::getMaxGridSize(); } - static int maxGridSize; - - static int gridsCount; + static int& gridsCount() + { + static int gridsCount = -1; + return gridsCount; + } }; -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::maxGridSize = Devices::Cuda::getMaxGridSize(); - -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; - - #endif } // namespace Algorithms diff --git a/src/TNL/Devices/Host.h b/src/TNL/Devices/Host.h index 649749f58..40f55711a 100644 --- a/src/TNL/Devices/Host.h +++ b/src/TNL/Devices/Host.h @@ -113,17 +113,19 @@ public: } protected: - static bool& ompEnabled() { + static bool& ompEnabled() + { #ifdef HAVE_OPENMP - static bool ompEnabled( true ); + static bool ompEnabled = true; #else - static bool ompEnabled( false ); + static bool ompEnabled = false; #endif return ompEnabled; } - static int& maxThreadsCount() { - static int maxThreadsCount( -1 ); + static int& maxThreadsCount() + { + static int maxThreadsCount = -1; return maxThreadsCount; } }; diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 3f0eee391..3cba026e3 
100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -74,11 +74,11 @@ TYPED_TEST( VectorTest, prefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; v = 0; v_host = -1; v.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -86,7 +86,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); @@ -94,7 +94,7 @@ TYPED_TEST( VectorTest, prefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i + 1 ); @@ -102,7 +102,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -110,7 +110,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); @@ -184,12 +184,12 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -197,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -205,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); @@ -213,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -221,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -229,7 +229,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, 
RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); -- GitLab From ac2ee07e3822d6a403a58d2042f8da52786ed59d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 14:51:03 +0200 Subject: [PATCH 20/23] CUDA prefix-sum: moved gridShift from the first phase to the second phase --- .../Algorithms/CudaPrefixSumKernel.h | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 8ac802f96..3153bae28 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -34,8 +34,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Index elementsInBlock, const Real* input, Real* output, - Real* auxArray, - const Real gridShift ) + Real* auxArray ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; @@ -67,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx += blockDim.x; } } - if( blockIdx.x == 0 && threadIdx.x == 0 ) - sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** * Perform the sequential prefix-sum. 
@@ -150,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, { if( prefixSumType == PrefixSumType::Exclusive ) { - Real aux = zero; - aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); - aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); - auxArray[ blockIdx.x ] = aux; + auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], + sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); } else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; @@ -172,15 +167,13 @@ cudaSecondPhaseBlockPrefixSum( Reduction reduction, Real* data ) { if( blockIdx.x > 0 ) + gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); + const Index readOffset = blockIdx.x * elementsInBlock; + Index readIdx = threadIdx.x; + while( readIdx < elementsInBlock && readOffset + readIdx < size ) { - const Real shift = auxArray[ blockIdx.x - 1 ]; - const Index readOffset = blockIdx.x * elementsInBlock; - Index readIdx = threadIdx.x; - while( readIdx < elementsInBlock && readOffset + readIdx < size ) - { - data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); - readIdx += blockDim.x; - } + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); + readIdx += blockDim.x; } } @@ -229,8 +222,7 @@ struct CudaPrefixSumKernelLauncher elementsInBlock, input, output, - auxArray1.getData(), - gridShift ); + auxArray1.getData() ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; @@ -264,12 +256,8 @@ struct CudaPrefixSumKernelLauncher cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; - cudaMemcpy( &gridShift, - &auxArray2[ auxArraySize - 1 ], - sizeof( Real ), - cudaMemcpyDeviceToHost ); + gridShift = auxArray2.getElement( auxArraySize - 1 ); //std::cerr << "gridShift = " << gridShift << std::endl; - TNL_CHECK_CUDA_DEVICE; } /**** -- 
GitLab From 2c40015f50fe443132325e2aebcd0f14322646a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 20:12:38 +0200 Subject: [PATCH 21/23] CUDA prefix-sum: separated the implementation of the first and second phase --- .../Algorithms/CudaPrefixSumKernel.h | 286 ++++++++++-------- src/TNL/Containers/Algorithms/PrefixSum.hpp | 30 +- 2 files changed, 177 insertions(+), 139 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 3153bae28..ae3bb84de 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -31,7 +31,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Reduction reduction, const Real zero, const Index size, - const Index elementsInBlock, + const int elementsInBlock, const Real* input, Real* output, Real* auxArray ) @@ -46,8 +46,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, /*** * Load data into the shared memory. 
*/ - const Index blockOffset = blockIdx.x * elementsInBlock; - Index idx = threadIdx.x; + const int blockOffset = blockIdx.x * elementsInBlock; + int idx = threadIdx.x; if( prefixSumType == PrefixSumType::Exclusive ) { if( idx == 0 ) @@ -81,7 +81,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; } - Index chunkPointer( 1 ); + int chunkPointer = 1; while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { @@ -132,7 +132,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { - const Index chunkIdx = idx / chunkSize; + const int chunkIdx = idx / chunkSize; Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; @@ -161,18 +161,20 @@ template< typename Real, __global__ void cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, - const Index elementsInBlock, - Real gridShift, + const int elementsInBlock, + const Index gridIdx, + const Index maxGridSize, const Real* auxArray, - Real* data ) + Real* data, + Real shift ) { - if( blockIdx.x > 0 ) - gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); - const Index readOffset = blockIdx.x * elementsInBlock; - Index readIdx = threadIdx.x; + if( gridIdx > 0 || blockIdx.x > 0 ) + shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); + const int readOffset = blockIdx.x * elementsInBlock; + int readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) { - data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; } } @@ -182,143 +184,183 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { + /**** + * \brief Performs both phases of prefix sum. 
+ * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral element for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. + */ template< typename Reduction > static void - cudaRecursivePrefixSum( PrefixSumType prefixSumType_, - Reduction& reduction, - const Real& zero, - const Index size, - const Index blockSize, - const Index elementsInBlock, - Real& gridShift, - const Real* input, - Real* output ) + perform( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) + { + const auto blockShifts = performFirstPhase( + size, + deviceInput, + deviceOutput, + reduction, + zero, + blockSize ); + performSecondPhase( + size, + deviceOutput, + blockShifts.getData(), + reduction, + zero, + blockSize ); + } + + /**** + * \brief Performs the first phase of prefix sum. + * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral value for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. 
+ */ + template< typename Reduction > + static auto + performFirstPhase( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) { + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - const Index auxArraySize = numberOfBlocks; - - Array< Real, Devices::Cuda > auxArray1, auxArray2; - auxArray1.setSize( auxArraySize ); - auxArray2.setSize( auxArraySize ); - - /**** - * Setup block and grid size. - */ - dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); - cudaBlockSize.x = blockSize; - cudaGridSize.x = roundUpDivision( size, elementsInBlock ); - - /**** - * Run the kernel. - */ - const std::size_t sharedDataSize = elementsInBlock + - elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; - const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); - cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( prefixSumType_, - reduction, - zero, - size, - elementsInBlock, - input, - output, - auxArray1.getData() ); + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); + //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; + + // allocate array for the block sums + Array< Real, Devices::Cuda > blockSums; + blockSums.setSize( numberOfBlocks ); + + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; + Index currentSize = size - gridOffset; + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; + //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; + + // setup block and grid size + dim3 cudaBlockSize, 
cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + const std::size_t sharedDataSize = elementsInBlock + + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; + const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); + cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( prefixSumType, + reduction, + zero, + currentSize, + elementsInBlock, + &deviceInput[ gridOffset ], + &deviceOutput[ gridOffset ], + &blockSums[ gridIdx * maxGridSize() ] ); + } + + // synchronize the null-stream after all grids cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; - - //std::cerr << " auxArray1 = " << auxArray1 << std::endl; - /*** - * In auxArray1 there is now a sum of numbers in each block. - * We must compute prefix-sum of auxArray1 and then shift - * each block. - */ - Real gridShift2 = zero; - if( numberOfBlocks > 1 ) - cudaRecursivePrefixSum( PrefixSumType::Inclusive, + // blockSums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + if( numberOfBlocks > 1 ) { + CudaPrefixSumKernelLauncher< PrefixSumType::Inclusive, Real, Index >::perform( + blockSums.getSize(), + blockSums.getData(), + blockSums.getData(), reduction, zero, - numberOfBlocks, - blockSize, - elementsInBlock, - gridShift2, - auxArray1.getData(), - auxArray2.getData() ); - - //std::cerr << " auxArray2 = " << auxArray2 << std::endl; - cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> - ( reduction, - size, - elementsInBlock, - gridShift, - auxArray2.getData(), - output ); - cudaStreamSynchronize(0); - TNL_CHECK_CUDA_DEVICE; + blockSize ); + } + + // Store the number of CUDA grids for the purpose of unit testing, i.e. + // to check if we test the algorithm with more than one CUDA grid. 
+ gridsCount() = numberOfGrids; - gridShift = auxArray2.getElement( auxArraySize - 1 ); - //std::cerr << "gridShift = " << gridShift << std::endl; + // blockSums now contains shift values for each block - to be used in the second phase + return blockSums; } /**** - * \brief Starts prefix sum in CUDA. + * \brief Performs the second phase of prefix sum. * - * \tparam Reduction reduction to be performed on particular elements - addition usually - * \param size is number of elements to be scanned - * \param blockSize is CUDA block size - * \param deviceInput is pointer to input data on GPU - * \param deviceOutput is pointer to resulting array, can be the same as input - * \param reduction is instance of Reduction - * \param zero is neutral element for given Reduction + * \param size Number of elements to be scanned. + * \param deviceOutput Pointer to output array on GPU. + * \param blockShifts Pointer to a GPU array containing the block shifts. It is the + * result of the first phase. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param shift A constant shifting all elements of the array (usually `zero`, i.e. + * the neutral value). + * \param blockSize The CUDA block size to be used for kernel launch. 
*/ template< typename Reduction > static void - start( const Index size, - const Index blockSize, - const Real *deviceInput, - Real* deviceOutput, - Reduction& reduction, - const Real& zero ) + performSecondPhase( const Index size, + Real* deviceOutput, + const Real* blockShifts, + Reduction& reduction, + const Real shift, + const Index blockSize = 256 ) { - /**** - * Compute the number of grids - */ - const Index elementsInBlock = 8 * blockSize; + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); - Real gridShift = zero; - //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; - /**** - * Loop over all grids. - */ - for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) - { - /**** - * Compute current grid size and size of data to be scanned - */ + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; - //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; - cudaRecursivePrefixSum( prefixSumType, - reduction, - zero, - currentSize, - blockSize, - elementsInBlock, - gridShift, - &deviceInput[ gridOffset ], - &deviceOutput[ gridOffset ] ); + + // setup block and grid size + dim3 cudaBlockSize, cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> + ( reduction, + size, + elementsInBlock, + gridIdx, + (Index) maxGridSize(), + blockShifts, + &deviceOutput[ gridOffset ], + shift ); } - /*** - 
* Store the number of CUDA grids for the purpose of unit testing, i.e. - * to check if we test the algorithm with more than one CUDA grid. - */ - gridsCount() = numberOfGrids; + // synchronize the null-stream after all grids + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; } /**** diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 0001a2f75..807e507ef 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -141,17 +141,18 @@ perform( Vector& v, const Reduction& reduction, const typename Vector::RealType& zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; -#ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::perform( + end - begin, + &v[ begin ], // input + &v[ begin ], // output reduction, zero ); +#else + throw Exceptions::CudaSupportMissing(); #endif } @@ -211,18 +212,13 @@ perform( Vector& v, const Reduction& reduction, const typename Vector::RealType& zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; -#ifdef HAVE_CUDA - throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); // NOT IMPLEMENTED YET - /*CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], - reduction, - zero );*/ + + throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." 
); +#else + throw Exceptions::CudaSupportMissing(); #endif } -- GitLab From 174ad5fd2494c3128560e0d451ba89883e5d1097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 21:05:19 +0200 Subject: [PATCH 22/23] PrefixSum: separate first and second phase for OpenMP implementation and expose performFirstPhase and performSecondPhase methods --- src/TNL/Containers/Algorithms/PrefixSum.h | 128 ++++++++----- src/TNL/Containers/Algorithms/PrefixSum.hpp | 190 ++++++++++++++------ 2 files changed, 213 insertions(+), 105 deletions(-) diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h index fb90ccb44..2b0e40458 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -26,71 +26,107 @@ enum class PrefixSumType { template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class PrefixSum; +struct PrefixSum; template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class SegmentedPrefixSum; +struct SegmentedPrefixSum; template< PrefixSumType Type > -class PrefixSum< Devices::Host, Type > +struct PrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename Reduction > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename 
BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class PrefixSum< Devices::Cuda, Type > +struct PrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename Reduction > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class SegmentedPrefixSum< Devices::Host, Type > +struct SegmentedPrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename Reduction, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + 
const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; template< PrefixSumType Type > -class SegmentedPrefixSum< Devices::Cuda, Type > +struct SegmentedPrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename Reduction, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 807e507ef..8af19d09a 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -12,69 +12,58 @@ #pragma once -#include // std::unique_ptr - #include "PrefixSum.h" -//#define CUDA_REDUCTION_PROFILING - #include -#include -#include +#include #include +#include #include -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - namespace TNL { namespace Containers { namespace Algorithms { -/**** - * Arrays smaller than the following constant - * are reduced on CPU. The constant must not be larger - * than maximal CUDA grid size. 
- */ -static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -//// -// PrefixSum on host template< PrefixSumType Type > -template< typename Vector, - typename Reduction > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) +{ +#ifdef HAVE_OPENMP + const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero ); + performSecondPhase( v, blockShifts, begin, end, reduction, zero ); +#else + // sequential prefix-sum does not need a second phase + performFirstPhase( v, begin, end, reduction, zero ); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Host, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; #ifdef HAVE_OPENMP const int threads = Devices::Host::getMaxThreadsCount(); - std::unique_ptr< RealType[] > block_sums{ - // Workaround for nvcc 10.1.168 - it would modifie the simple expression - // `new RealType[reducedSize]` in the source code to `new (RealType[reducedSize])` - // which is not correct - see e.g. https://stackoverflow.com/a/39671946 - // Thus, the host compiler would spit out hundreds of warnings... - // Funnily enough, nvcc's behaviour depends on the context rather than the - // expression, because exactly the same simple expression in different places - // does not produce warnings. 
- #ifdef __NVCC__ - new RealType[ static_cast(threads) + 1 ] - #else - new RealType[ threads + 1 ] - #endif - }; + Array< RealType, Devices::Host > block_sums( threads + 1 ); block_sums[ 0 ] = zero; - #pragma omp parallel + #pragma omp parallel num_threads(threads) { // init const int thread_idx = omp_get_thread_num(); @@ -99,18 +88,15 @@ perform( Vector& v, // write the block sums into the buffer block_sums[ thread_idx + 1 ] = block_sum; - #pragma omp barrier + } - // calculate per-block offsets - RealType offset = 0; - for( int i = 0; i < thread_idx + 1; i++ ) - offset = reduction( offset, block_sums[ i ] ); + // block_sums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + for( int i = 1; i < threads + 1; i++ ) + block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] ); - // shift intermediate results by the offset - #pragma omp for schedule(static) - for( IndexType i = begin; i < end; i++ ) - v[ i ] = reduction( v[ i ], offset ); - } + // block_sums now contains shift values for each block - to be used in the second phase + return block_sums; #else if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) @@ -125,21 +111,57 @@ perform( Vector& v, aux = reduction( aux, x ); } } + + return 0; +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Host, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ) +{ + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + +#ifdef HAVE_OPENMP + const int threads = blockShifts.getSize() - 1; + + // launch exactly the same number of threads as in the first phase + #pragma omp parallel num_threads(threads) + { + const int 
thread_idx = omp_get_thread_num(); + const RealType offset = reduction( blockShifts[ thread_idx ], shift ); + + // shift intermediate results by the offset + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], offset ); + } +#else + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], shift ); #endif } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > -template< typename Vector, - typename Reduction > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; @@ -156,9 +178,61 @@ perform( Vector& v, #endif } +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Cuda, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) +{ +#ifdef HAVE_CUDA + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + return CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performFirstPhase( + end - begin, + &v[ begin ], // input + &v[ begin ], // output + reduction, + zero ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Cuda, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ) +{ +#ifdef HAVE_CUDA + using RealType = typename 
Vector::RealType; + using IndexType = typename Vector::IndexType; + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performSecondPhase( + end - begin, + &v[ begin ], // output + blockShifts.getData(), + reduction, + shift ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + -//// -// PrefixSum on host template< PrefixSumType Type > template< typename Vector, typename Reduction, @@ -170,7 +244,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; @@ -197,8 +271,6 @@ perform( Vector& v, } } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, typename Reduction, @@ -210,7 +282,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; -- GitLab From d13a2d1870e714dbccd92623edd2a1e139462d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 22:27:06 +0200 Subject: [PATCH 23/23] Implemented distributed prefix-sum Fixes #43 --- .../Algorithms/DistributedPrefixSum.h | 70 +++++ src/TNL/Containers/DistributedVector.h | 9 +- src/TNL/Containers/DistributedVector.hpp | 42 +-- src/TNL/Containers/DistributedVectorView.h | 9 +- src/TNL/Containers/DistributedVectorView.hpp | 42 +-- .../Containers/DistributedVectorTest.h | 268 ++++++++++++++++-- .../Containers/VectorPrefixSumTest.h | 140 +++++---- 7 files changed, 418 insertions(+), 162 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/DistributedPrefixSum.h diff --git a/src/TNL/Containers/Algorithms/DistributedPrefixSum.h 
b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h new file mode 100644 index 000000000..b81e2ac94 --- /dev/null +++ b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h @@ -0,0 +1,70 @@ +/*************************************************************************** + PrefixSum.h - description + ------------------- + begin : Aug 16, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< PrefixSumType Type > +struct DistributedPrefixSum +{ + template< typename DistributedVector, + typename Reduction > + static void + perform( DistributedVector& v, + typename DistributedVector::IndexType begin, + typename DistributedVector::IndexType end, + const Reduction& reduction, + const typename DistributedVector::RealType zero ) + { + using RealType = typename DistributedVector::RealType; + using DeviceType = typename DistributedVector::DeviceType; + using CommunicatorType = typename DistributedVector::CommunicatorType; + + const auto group = v.getCommunicationGroup(); + if( group != CommunicatorType::NullGroup ) { + // adjust begin and end for the local range + const auto localRange = v.getLocalRange(); + begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin(); + end = max( min( end, localRange.getEnd() ), localRange.getBegin() ) - localRange.getBegin(); + + // perform first phase on the local data + auto localView = v.getLocalView(); + const auto blockShifts = PrefixSum< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); + const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); + + // exchange local sums between ranks + const int nproc = CommunicatorType::GetSize( 
group ); + RealType dataForScatter[ nproc ]; + for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; + Vector< RealType, Devices::Host > rankSums( nproc ); + // NOTE: exchanging general data types does not work with MPI + CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); + + // compute prefix-sum of the per-rank sums + PrefixSum< Devices::Host, PrefixSumType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero ); + + // perform second phase: shift by the per-block and per-rank offsets + const int rank = CommunicatorType::GetRank( group ); + PrefixSum< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] ); + } + } +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h index 75f204dcc..3438ddbd0 100644 --- a/src/TNL/Containers/DistributedVector.h +++ b/src/TNL/Containers/DistributedVector.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVector& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index f50624c61..0a6ac1547 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -13,7 +13,7 @@ #pragma once #include "DistributedVector.h" -#include +#include namespace TNL { namespace Containers { @@ -298,44 +298,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< 
Algorithms::PrefixSumType Type > void DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index 0ab923200..f0e7d9127 100644 --- a/src/TNL/Containers/DistributedVectorView.h +++ b/src/TNL/Containers/DistributedVectorView.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVectorView& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void 
prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 2f1e4666e..0268e35da 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -13,7 +13,7 @@ #pragma once #include "DistributedVectorView.h" -#include +#include namespace TNL { namespace Containers { @@ -274,44 +274,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< Algorithms::PrefixSumType Type > void DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/UnitTests/Containers/DistributedVectorTest.h 
b/src/UnitTests/Containers/DistributedVectorTest.h index e2d07a02a..430a42d5e 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -43,32 +43,26 @@ protected: using VectorViewType = typename DistributedVectorType::LocalViewType; using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >; - const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; - DistributedVectorType x, y, z; - - DistributedVectorView x_view, y_view, z_view; + DistributedVectorType v; + DistributedVectorView v_view; + typename DistributedVectorType::HostType v_host; const int rank = CommunicatorType::GetRank(group); const int nproc = CommunicatorType::GetSize(group); + // should be small enough to have fast tests, but large enough to test + // prefix-sum with multiple CUDA grids + const int globalSize = 10000 * nproc; + DistributedVectorTest() { using LocalRangeType = typename DistributedVector::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - x.setDistribution( localRange, globalSize, group ); - y.setDistribution( localRange, globalSize, group ); - z.setDistribution( localRange, globalSize, group ); - - x_view.bind( x ); - y_view.bind( y ); - z_view.bind( z ); - - setConstantSequence( x, 1 ); - setLinearSequence( y ); - setNegativeLinearSequence( z ); + v.setDistribution( localRange, globalSize, group ); + v_view.bind( v ); + setConstantSequence( v, 1 ); } }; @@ -85,7 +79,247 @@ using DistributedVectorTypes = ::testing::Types< TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes ); -// TODO: distributed prefix sum +#if 1 +TYPED_TEST( DistributedVectorTest, prefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename 
TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. 
+ if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif + +#if 1 +TYPED_TEST( DistributedVectorTest, exclusivePrefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< 
Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( 
Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif #endif // HAVE_GTEST diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 3cba026e3..4659d365d 100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -26,48 +26,56 @@ TYPED_TEST( VectorTest, prefixSum ) using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) - return; + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; VectorType v( size ); ViewType v_view( v ); typename VectorType::HostType v_host( size ); - v = 0; + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 
0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. 
@@ -75,45 +83,56 @@ TYPED_TEST( VectorTest, prefixSum ) { #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; - v = 0; + + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); 
EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); #endif } @@ -128,8 +147,8 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) return; VectorType v; @@ -137,47 +156,48 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) ViewType v_view( v ); typename VectorType::HostType v_host( size ); - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) 
- EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. @@ -186,53 +206,55 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) 
- EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); #endif } -- GitLab