Loading src/TNL/Algorithms/Multireduction.h +12 −6 Original line number Diff line number Diff line Loading @@ -29,7 +29,9 @@ struct Multireduction< Devices::Sequential > { /** * Parameters: * zero: starting value for reduction * identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) Loading @@ -45,7 +47,7 @@ struct Multireduction< Devices::Sequential > typename Reduction, typename Index > static constexpr void reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -58,7 +60,9 @@ struct Multireduction< Devices::Host > { /** * Parameters: * zero: starting value for reduction * identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) Loading @@ -74,7 +78,7 @@ struct Multireduction< Devices::Host > typename Reduction, typename Index > static void reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -87,7 +91,9 @@ struct Multireduction< Devices::Cuda > { /** * Parameters: * zero: starting value for reduction * identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) Loading @@ -103,7 +109,7 @@ struct Multireduction< Devices::Cuda > typename Reduction, typename Index > static void reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading src/TNL/Algorithms/Multireduction.hpp +10 −10 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ template< typename Result, typename Index > void constexpr Multireduction< Devices::Sequential >:: reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -53,7 +53,7 @@ reduce( const Result zero, // (it is accessed as a row-major matrix with n rows and 4 columns) Result r[ n * 4 ]; for( int k = 0; k < n * 4; k++ ) r[ k ] = zero; r[ k ] = identity; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { Loading Loading @@ -89,7 +89,7 @@ reduce( const Result zero, } else { for( int k = 0; k < n; k++ ) result[ k ] = zero; result[ k ] = identity; for( int b = 0; b < blocks; b++ ) { const Index offset = b * block_size; Loading @@ -112,7 +112,7 @@ template< typename Result, typename Index > void Multireduction< Devices::Host >:: reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -134,14 +134,14 @@ reduce( const Result zero, #pragma omp single nowait { for( int k = 0; k < n; k++ ) result[ k ] = zero; result[ k ] = identity; } // initialize array for thread-local results // (it is accessed as a row-major matrix with n rows and 4 columns) Result r[ n * 4 ]; for( int k = 0; k < n * 4; k++ ) r[ k ] = zero; r[ k ] = identity; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { Loading Loading @@ -185,7 +185,7 @@ reduce( const Result zero, } else #endif Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result ); Multireduction< Devices::Sequential >::reduce( identity, dataFetcher, reduction, size, n, result ); } template< typename Result, Loading @@ -194,7 +194,7 @@ template< typename Result, typename Index > void Multireduction< Devices::Cuda >:: reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -212,7 +212,7 @@ reduce( const Result zero, // start the reduction on the GPU Result* deviceAux1 = nullptr; const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); const int reducedSize = detail::CudaMultireductionKernelLauncher( identity, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -234,7 +234,7 @@ reduce( const Result zero, // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); Multireduction< Devices::Sequential >::reduce( identity, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading src/TNL/Algorithms/SegmentedScan.h +12 −9 Original line number Diff line number Diff line Loading @@ -81,8 +81,9 @@ struct SegmentedScan< Devices::Sequential, Type > * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * Loading @@ -107,7 +108,7 @@ struct SegmentedScan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ); const typename Vector::ValueType identity ); }; template< detail::ScanType Type > Loading @@ -125,8 +126,9 @@ struct SegmentedScan< Devices::Host, Type > * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * Loading @@ -151,7 +153,7 @@ struct SegmentedScan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ); const typename Vector::ValueType identity ); }; template< detail::ScanType Type > Loading @@ -169,8 +171,9 @@ struct SegmentedScan< Devices::Cuda, Type > * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * Loading @@ -197,7 +200,7 @@ struct SegmentedScan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ); const typename Vector::ValueType identity ); }; } // namespace Algorithms Loading src/TNL/Algorithms/SegmentedScan.hpp +7 −7 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ) const typename Vector::ValueType identity ) { using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; Loading @@ -44,12 +44,12 @@ perform( Vector& v, else // Exclusive scan { ValueType aux( v[ begin ] ); v[ begin ] = zero; v[ begin ] = identity; for( IndexType i = begin + 1; i < end; i++ ) { ValueType x = v[ i ]; if( flags[ i ] ) aux = zero; aux = identity; v[ i ] = aux; aux = reduction( aux, x ); } Loading @@ -67,13 +67,13 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ) const typename Vector::ValueType identity ) { #ifdef HAVE_OPENMP // TODO: parallelize with OpenMP SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero ); SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity ); #else SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero ); SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity ); #endif } Loading @@ -88,7 +88,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ) const typename Vector::ValueType identity ) { #ifdef HAVE_CUDA using ValueType = typename Vector::ValueType; Loading src/TNL/Algorithms/detail/CudaMultireductionKernel.h +12 −12 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ template< int blockSizeX, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, CudaMultireductionKernel( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -65,7 +65,7 @@ CudaMultireductionKernel( const Result zero, const int y = blockIdx.y * blockDim.y + threadIdx.y; if( y >= n ) return; sdata[ tid ] = zero; sdata[ tid ] = identity; // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSizeX < size ) { Loading Loading @@ -145,7 +145,7 @@ template< typename Result, typename Reduction, typename Index > int CudaMultireductionKernelLauncher( const Result zero, CudaMultireductionKernelLauncher( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading Loading @@ -217,55 +217,55 @@ CudaMultireductionKernelLauncher( const Result zero, { case 512: CudaMultireductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 256: cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 128: cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 64: cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 32: cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 16: cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 8: cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 4: cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 2: cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); Loading Loading
src/TNL/Algorithms/Multireduction.h +12 −6 Original line number Diff line number Diff line Loading @@ -29,7 +29,9 @@ struct Multireduction< Devices::Sequential > { /** * Parameters: * zero: starting value for reduction * identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) Loading @@ -45,7 +47,7 @@ struct Multireduction< Devices::Sequential > typename Reduction, typename Index > static constexpr void reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -58,7 +60,9 @@ struct Multireduction< Devices::Host > { /** * Parameters: * zero: starting value for reduction * identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) Loading @@ -74,7 +78,7 @@ struct Multireduction< Devices::Host > typename Reduction, typename Index > static void reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -87,7 +91,9 @@ struct Multireduction< Devices::Cuda > { /** * Parameters: * zero: starting value for reduction * identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction * dataFetcher: callable object such that `dataFetcher( i, j )` yields * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) Loading @@ -103,7 +109,7 @@ struct Multireduction< Devices::Cuda > typename Reduction, typename Index > static void reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading
src/TNL/Algorithms/Multireduction.hpp +10 −10 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ template< typename Result, typename Index > void constexpr Multireduction< Devices::Sequential >:: reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -53,7 +53,7 @@ reduce( const Result zero, // (it is accessed as a row-major matrix with n rows and 4 columns) Result r[ n * 4 ]; for( int k = 0; k < n * 4; k++ ) r[ k ] = zero; r[ k ] = identity; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { Loading Loading @@ -89,7 +89,7 @@ reduce( const Result zero, } else { for( int k = 0; k < n; k++ ) result[ k ] = zero; result[ k ] = identity; for( int b = 0; b < blocks; b++ ) { const Index offset = b * block_size; Loading @@ -112,7 +112,7 @@ template< typename Result, typename Index > void Multireduction< Devices::Host >:: reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -134,14 +134,14 @@ reduce( const Result zero, #pragma omp single nowait { for( int k = 0; k < n; k++ ) result[ k ] = zero; result[ k ] = identity; } // initialize array for thread-local results // (it is accessed as a row-major matrix with n rows and 4 columns) Result r[ n * 4 ]; for( int k = 0; k < n * 4; k++ ) r[ k ] = zero; r[ k ] = identity; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { Loading Loading @@ -185,7 +185,7 @@ reduce( const Result zero, } else #endif Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result ); Multireduction< Devices::Sequential >::reduce( identity, dataFetcher, reduction, size, n, result ); } template< typename Result, Loading @@ -194,7 +194,7 @@ template< typename Result, typename Index > void Multireduction< Devices::Cuda >:: reduce( const Result zero, reduce( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -212,7 +212,7 @@ reduce( const Result zero, // start the reduction on the GPU Result* deviceAux1 = nullptr; const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); const int reducedSize = detail::CudaMultireductionKernelLauncher( identity, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -234,7 +234,7 @@ reduce( const Result zero, // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); Multireduction< Devices::Sequential >::reduce( identity, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading
src/TNL/Algorithms/SegmentedScan.h +12 −9 Original line number Diff line number Diff line Loading @@ -81,8 +81,9 @@ struct SegmentedScan< Devices::Sequential, Type > * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * Loading @@ -107,7 +108,7 @@ struct SegmentedScan< Devices::Sequential, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ); const typename Vector::ValueType identity ); }; template< detail::ScanType Type > Loading @@ -125,8 +126,9 @@ struct SegmentedScan< Devices::Host, Type > * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * Loading @@ -151,7 +153,7 @@ struct SegmentedScan< Devices::Host, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ); const typename Vector::ValueType identity ); }; template< detail::ScanType Type > Loading @@ -169,8 +171,9 @@ struct SegmentedScan< Devices::Cuda, Type > * \param begin the first element in the array to be scanned * \param end the last element in the array to be scanned * \param reduction lambda function implementing the reduction operation * \param zero is the idempotent element for the reduction operation, i.e. element which * does not change the result of the reduction. * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element) * for the reduction operation, i.e. element which does not * change the result of the reduction. * * The reduction lambda function takes two variables which are supposed to be reduced: * Loading @@ -197,7 +200,7 @@ struct SegmentedScan< Devices::Cuda, Type > const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ); const typename Vector::ValueType identity ); }; } // namespace Algorithms Loading
src/TNL/Algorithms/SegmentedScan.hpp +7 −7 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ) const typename Vector::ValueType identity ) { using ValueType = typename Vector::ValueType; using IndexType = typename Vector::IndexType; Loading @@ -44,12 +44,12 @@ perform( Vector& v, else // Exclusive scan { ValueType aux( v[ begin ] ); v[ begin ] = zero; v[ begin ] = identity; for( IndexType i = begin + 1; i < end; i++ ) { ValueType x = v[ i ]; if( flags[ i ] ) aux = zero; aux = identity; v[ i ] = aux; aux = reduction( aux, x ); } Loading @@ -67,13 +67,13 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ) const typename Vector::ValueType identity ) { #ifdef HAVE_OPENMP // TODO: parallelize with OpenMP SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero ); SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity ); #else SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero ); SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity ); #endif } Loading @@ -88,7 +88,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, const typename Vector::ValueType zero ) const typename Vector::ValueType identity ) { #ifdef HAVE_CUDA using ValueType = typename Vector::ValueType; Loading
src/TNL/Algorithms/detail/CudaMultireductionKernel.h +12 −12 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ template< int blockSizeX, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, CudaMultireductionKernel( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading @@ -65,7 +65,7 @@ CudaMultireductionKernel( const Result zero, const int y = blockIdx.y * blockDim.y + threadIdx.y; if( y >= n ) return; sdata[ tid ] = zero; sdata[ tid ] = identity; // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSizeX < size ) { Loading Loading @@ -145,7 +145,7 @@ template< typename Result, typename Reduction, typename Index > int CudaMultireductionKernelLauncher( const Result zero, CudaMultireductionKernelLauncher( const Result identity, DataFetcher dataFetcher, const Reduction reduction, const Index size, Loading Loading @@ -217,55 +217,55 @@ CudaMultireductionKernelLauncher( const Result zero, { case 512: CudaMultireductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 256: cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 128: cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 64: cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 32: cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 16: cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 8: cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 4: cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 2: cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); Loading