Refactored CudaReductionKernel - split the parallel reduction into CudaBlockReduce (3bd8fff5) · Commits · TNL / tnl-dev

src/TNL/Algorithms/detail/CudaReductionKernel.h

+288 −187

Original line number	Diff line number	Diff line
		@@ -23,23 +23,7 @@ namespace TNL {
		namespace Algorithms {
		namespace detail {

		/****
		* The performance of this kernel is very sensitive to register usage.
		* Compile with --ptxas-options=-v and configure these constants for given
		* architecture so that there are no local memory spills.
		*/
		static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2
		static constexpr int Reduction_registersPerThread = 32; // empirically determined optimal value

		#ifdef HAVE_CUDA
		// __CUDA_ARCH__ is defined only in device code!
		#if (__CUDA_ARCH__ == 750 )
		// Turing has a limit of 1024 threads per multiprocessor
		static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
		#else
		static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
		#endif

		/*
		* nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the
		* std::plus<void>::operator() function to be constexpr and hence __host__ __device__
		@@ -64,236 +48,353 @@ auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg
		#endif
		}

		/* Template for cooperative reduction across the CUDA block of threads.
		* It is a cooperative operation - all threads must call the operation,
		* otherwise it will deadlock!
		*
		* The default implementation is generic and the reduction is done using
		* shared memory. Specializations can be made based on `Reduction` and
		* `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported
		* value types.
		*/
		template< int blockSize,
		typename Result,
		typename DataFetcher,
		typename Reduction,
		typename Index >
		__global__ void
		__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
		CudaReductionKernel( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index begin,
		const Index end,
		Result* output )
		typename ValueType >
		struct CudaBlockReduce
		{
		// storage to be allocated in shared memory
		struct Storage
		{
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
		__shared__ Result sdata[shmemElements];

		// Get the thread id (tid), global thread id (gid) and gridSize.
		const Index tid = threadIdx.x;
		Index gid = begin + blockIdx.x * blockDim.x + threadIdx.x;
		const Index gridSize = blockDim.x * gridDim.x;

		sdata[ tid ] = zero;
		ValueType data[ (blockSize <= 32) ? 2 * blockSize : blockSize ];
		};

		// Start with the sequential reduction and push the result into the shared memory.
		while( gid + 4 * gridSize < end ) {
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
		gid += 4 * gridSize;
		}
		while( gid + 2 * gridSize < end ) {
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
		gid += 2 * gridSize;
		}
		while( gid < end ) {
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
		gid += gridSize;
		}
		/* Cooperative reduction across the CUDA block - each thread will get the
		* result of the reduction
		*
		* \param reduction The binary reduction functor.
		* \param threadValue Value of the calling thread to be reduced.
		* \param tid Index of the calling thread (usually `threadIdx.x`,
		* unless you know what you are doing).
		* \param storage Auxiliary storage (must be allocated as a __shared__
		* variable).
		*/
		__device__ static
		ValueType
		reduce( const Reduction& reduction,
		ValueType threadValue,
		int tid,
		Storage& storage )
		{
		storage.data[ tid ] = threadValue;
		__syncthreads();

		// Perform the parallel reduction.
		if( blockSize >= 1024 ) {
		if( tid < 512 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 512 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 512 ] );
		__syncthreads();
		}
		if( blockSize >= 512 ) {
		if( tid < 256 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 256 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 256 ] );
		__syncthreads();
		}
		if( blockSize >= 256 ) {
		if( tid < 128 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 128 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 128 ] );
		__syncthreads();
		}
		if( blockSize >= 128 ) {
		if( tid < 64 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 64 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 64 ] );
		__syncthreads();
		}

		// This runs in one warp so we use __syncwarp() instead of __syncthreads().
		if( tid < 32 ) {
		if( blockSize >= 64 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 32 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 32 ] );
		__syncwarp();
		// Note that here we do not have to check if tid < 16 etc, because we have
		// 2 * blockSize.x elements of shared memory per block, so we do not
		// access out of bounds. The results for the upper half will be undefined,
		// but unused anyway.
		if( blockSize >= 32 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 16 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 16 ] );
		__syncwarp();
		if( blockSize >= 16 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 8 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 8 ] );
		__syncwarp();
		if( blockSize >= 8 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 4 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 4 ] );
		__syncwarp();
		if( blockSize >= 4 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 2 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 2 ] );
		__syncwarp();
		if( blockSize >= 2 )
		sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 1 ] );
		storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 1 ] );
		}

		// Store the result back in the global memory.
		if( tid == 0 )
		output[ blockIdx.x ] = sdata[ 0 ];
		__syncthreads();
		return storage.data[ 0 ];
		}
		};

		/* Template for cooperative reduction with argument across the CUDA block of
		* threads. It is a cooperative operation - all threads must call the
		* operation, otherwise it will deadlock!
		*
		* The default implementation is generic and the reduction is done using
		* shared memory. Specializations can be made based on `Reduction` and
		* `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported
		* value types.
		*/
		template< int blockSize,
		typename Result,
		typename DataFetcher,
		typename Reduction,
		typename Index >
		__global__ void
		__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
		CudaReductionWithArgumentKernel( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const Index begin,
		const Index end,
		Result* output,
		Index* idxOutput,
		const Index* idxInput = nullptr )
		typename ValueType,
		typename IndexType >
		struct CudaBlockReduceWithArgument
		{
		// storage to be allocated in shared memory
		struct Storage
		{
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
		__shared__ Result sdata[shmemElements];
		__shared__ Index sidx[shmemElements];

		// Get the thread id (tid), global thread id (gid) and gridSize.
		const Index tid = threadIdx.x;
		Index gid = begin + blockIdx.x * blockDim.x + threadIdx.x;
		const Index gridSize = blockDim.x * gridDim.x;
		ValueType data[ (blockSize <= 32) ? 2 * blockSize : blockSize ];
		IndexType idx [ (blockSize <= 32) ? 2 * blockSize : blockSize ];
		};

		// Start with the sequential reduction and push the result into the shared memory.
		if( idxInput ) {
		if( gid < end ) {
		sdata[ tid ] = dataFetcher( gid );
		sidx[ tid ] = idxInput[ gid ];
		gid += gridSize;
		} else {
		sdata[ tid ] = zero;
		}
		while( gid + 4 * gridSize < end ) {
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
		reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], idxInput[ gid + 2 * gridSize ] );
		reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], idxInput[ gid + 3 * gridSize ] );
		gid += 4 * gridSize;
		}
		while( gid + 2 * gridSize < end ) {
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
		gid += 2 * gridSize;
		}
		while( gid < end ) {
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
		gid += gridSize;
		}
		}
		else {
		if( gid < end ) {
		sdata[ tid ] = dataFetcher( gid );
		sidx[ tid ] = gid;
		gid += gridSize;
		} else {
		sdata[ tid ] = zero;
		}
		while( gid + 4 * gridSize < end ) {
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
		reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], gid + 2 * gridSize );
		reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], gid + 3 * gridSize );
		gid += 4 * gridSize;
		}
		while( gid + 2 * gridSize < end ) {
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
		reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
		gid += 2 * gridSize;
		}
		while( gid < end ) {
		reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
		gid += gridSize;
		}
		}
		/* Cooperative reduction with argument across the CUDA block - each thread
		* will get the pair of the result of the reduction and the index
		*
		* \param reduction The binary reduction functor.
		* \param threadValue Value of the calling thread to be reduced.
		* \param threadIndex Index value of the calling thread to be reduced.
		* \param tid Index of the calling thread (usually `threadIdx.x`,
		* unless you know what you are doing).
		* \param storage Auxiliary storage (must be allocated as a __shared__
		* variable).
		*/
		__device__ static
		std::pair< ValueType, IndexType >
		reduceWithArgument( const Reduction& reduction,
		ValueType threadValue,
		IndexType threadIndex,
		int tid,
		Storage& storage )
		{
		storage.data[ tid ] = threadValue;
		storage.idx[ tid ] = threadIndex;
		__syncthreads();

		// Perform the parallel reduction.
		if( blockSize >= 1024 ) {
		if( tid < 512 )
		reduction( sdata[ tid ], sdata[ tid + 512 ], sidx[ tid ], sidx[ tid + 512 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 512 ], storage.idx[ tid ], storage.idx[ tid + 512 ] );
		__syncthreads();
		}
		if( blockSize >= 512 ) {
		if( tid < 256 )
		reduction( sdata[ tid ], sdata[ tid + 256 ], sidx[ tid ], sidx[ tid + 256 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 256 ], storage.idx[ tid ], storage.idx[ tid + 256 ] );
		__syncthreads();
		}
		if( blockSize >= 256 ) {
		if( tid < 128 )
		reduction( sdata[ tid ], sdata[ tid + 128 ], sidx[ tid ], sidx[ tid + 128 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 128 ], storage.idx[ tid ], storage.idx[ tid + 128 ] );
		__syncthreads();
		}
		if( blockSize >= 128 ) {
		if( tid < 64 )
		reduction( sdata[ tid ], sdata[ tid + 64 ], sidx[ tid ], sidx[ tid + 64 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 64 ], storage.idx[ tid ], storage.idx[ tid + 64 ] );
		__syncthreads();
		}

		// This runs in one warp so we use __syncwarp() instead of __syncthreads().
		if( tid < 32 ) {
		if( blockSize >= 64 )
		reduction( sdata[ tid ], sdata[ tid + 32 ], sidx[ tid ], sidx[ tid + 32 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 32 ], storage.idx[ tid ], storage.idx[ tid + 32 ] );
		__syncwarp();
		// Note that here we do not have to check if tid < 16 etc, because we have
		// 2 * blockSize.x elements of shared memory per block, so we do not
		// access out of bounds. The results for the upper half will be undefined,
		// but unused anyway.
		if( blockSize >= 32 )
		reduction( sdata[ tid ], sdata[ tid + 16 ], sidx[ tid ], sidx[ tid + 16 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 16 ], storage.idx[ tid ], storage.idx[ tid + 16 ] );
		__syncwarp();
		if( blockSize >= 16 )
		reduction( sdata[ tid ], sdata[ tid + 8 ], sidx[ tid ], sidx[ tid + 8 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 8 ], storage.idx[ tid ], storage.idx[ tid + 8 ] );
		__syncwarp();
		if( blockSize >= 8 )
		reduction( sdata[ tid ], sdata[ tid + 4 ], sidx[ tid ], sidx[ tid + 4 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 4 ], storage.idx[ tid ], storage.idx[ tid + 4 ] );
		__syncwarp();
		if( blockSize >= 4 )
		reduction( sdata[ tid ], sdata[ tid + 2 ], sidx[ tid ], sidx[ tid + 2 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 2 ], storage.idx[ tid ], storage.idx[ tid + 2 ] );
		__syncwarp();
		if( blockSize >= 2 )
		reduction( sdata[ tid ], sdata[ tid + 1 ], sidx[ tid ], sidx[ tid + 1 ] );
		reduction( storage.data[ tid ], storage.data[ tid + 1 ], storage.idx[ tid ], storage.idx[ tid + 1 ] );
		}

		__syncthreads();
		return std::make_pair( storage.data[ 0 ], storage.idx[ 0 ] );
		}
		};
		#endif

		/****
		* The performance of this kernel is very sensitive to register usage.
		* Compile with --ptxas-options=-v and configure these constants for given
		* architecture so that there are no local memory spills.
		*/
		static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2
		static constexpr int Reduction_registersPerThread = 32; // empirically determined optimal value

		#ifdef HAVE_CUDA
		// __CUDA_ARCH__ is defined only in device code!
		#if (__CUDA_ARCH__ == 750 )
		// Turing has a limit of 1024 threads per multiprocessor
		static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
		#else
		static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
		#endif

		template< int blockSize,
		typename Result,
		typename DataFetcher,
		typename Reduction,
		typename Index >
		__global__ void
		__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
		CudaReductionKernel( Result initialValue,
		DataFetcher dataFetcher,
		const Reduction reduction,
		Index begin,
		Index end,
		Result* output )
		{
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );

		// allocate shared memory
		using BlockReduce = CudaBlockReduce< blockSize, Reduction, Result >;
		__shared__ typename BlockReduce::Storage storage;

		// Calculate the grid size (stride of the sequential reduction loop).
		const Index gridSize = blockDim.x * gridDim.x;
		// Shift the input lower bound by the thread index in the grid.
		begin += blockIdx.x * blockDim.x + threadIdx.x;

		// Start with the sequential reduction and push the result into the shared memory.
		while( begin + 4 * gridSize < end ) {
		initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
		initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + gridSize ) );
		initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + 2 * gridSize ) );
		initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + 3 * gridSize ) );
		begin += 4 * gridSize;
		}
		while( begin + 2 * gridSize < end ) {
		initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
		initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + gridSize ) );
		begin += 2 * gridSize;
		}
		while( begin < end ) {
		initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
		begin += gridSize;
		}
		__syncthreads();

		// Perform the parallel reduction.
		initialValue = BlockReduce::reduce( reduction, initialValue, threadIdx.x, storage );

		// Store the result back in the global memory.
		if( threadIdx.x == 0 )
		output[ blockIdx.x ] = initialValue;
		}

		template< int blockSize,
		typename Result,
		typename DataFetcher,
		typename Reduction,
		typename Index >
		__global__ void
		__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
		CudaReductionWithArgumentKernel( Result initialValue,
		DataFetcher dataFetcher,
		const Reduction reduction,
		Index begin,
		Index end,
		Result* output,
		Index* idxOutput,
		const Index* idxInput = nullptr )
		{
		TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );

		// allocate shared memory
		using BlockReduce = CudaBlockReduceWithArgument< blockSize, Reduction, Result, Index >;
		__shared__ typename BlockReduce::Storage storage;

		// Calculate the grid size (stride of the sequential reduction loop).
		const Index gridSize = blockDim.x * gridDim.x;
		// Shift the input lower bound by the thread index in the grid.
		begin += blockIdx.x * blockDim.x + threadIdx.x;

		// TODO: initialIndex should be passed as an argument to the kernel
		Index initialIndex;

		// Start with the sequential reduction and push the result into the shared memory.
		if( idxInput ) {
		if( begin < end ) {
		initialValue = dataFetcher( begin );
		initialIndex = idxInput[ begin ];
		begin += gridSize;
		}
		while( begin + 4 * gridSize < end ) {
		reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
		reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
		reduction( initialValue, dataFetcher( begin + 2 * gridSize ), initialIndex, idxInput[ begin + 2 * gridSize ] );
		reduction( initialValue, dataFetcher( begin + 3 * gridSize ), initialIndex, idxInput[ begin + 3 * gridSize ] );
		begin += 4 * gridSize;
		}
		while( begin + 2 * gridSize < end ) {
		reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
		reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
		begin += 2 * gridSize;
		}
		while( begin < end ) {
		reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
		begin += gridSize;
		}
		}
		else {
		if( begin < end ) {
		initialValue = dataFetcher( begin );
		initialIndex = begin;
		begin += gridSize;
		}
		while( begin + 4 * gridSize < end ) {
		reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
		reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
		reduction( initialValue, dataFetcher( begin + 2 * gridSize ), initialIndex, begin + 2 * gridSize );
		reduction( initialValue, dataFetcher( begin + 3 * gridSize ), initialIndex, begin + 3 * gridSize );
		begin += 4 * gridSize;
		}
		while( begin + 2 * gridSize < end ) {
		reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
		reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
		begin += 2 * gridSize;
		}
		while( begin < end ) {
		reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
		begin += gridSize;
		}
		}
		__syncthreads();

		// Perform the parallel reduction.
		const std::pair< Result, Index > result = BlockReduce::reduceWithArgument( reduction, initialValue, initialIndex, threadIdx.x, storage );

		// Store the result back in the global memory.
		if( tid == 0 ) {
		output[ blockIdx.x ] = sdata[ 0 ];
		idxOutput[ blockIdx.x ] = sidx[ 0 ];
		if( threadIdx.x == 0 ) {
		output[ blockIdx.x ] = result.first;
		idxOutput[ blockIdx.x ] = result.second;
		}
		}
		#endif