/***************************************************************************
                          CudaReductionKernel.h  -  description
                             -------------------
    begin                : Jun 17, 2015
    copyright            : (C) 2015 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/
/* See Copyright Notice in tnl/Copyright */

#pragma once

#ifdef HAVE_CUDA
#include <cuda.h>
#endif
#include <TNL/Assert.h>
#include <TNL/Math.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
namespace TNL {
namespace Containers {
namespace Algorithms {

#ifdef HAVE_CUDA
/****
* The performance of this kernel is very sensitive to register usage.
* Compile with --ptxas-options=-v and configure these constants for the given
* architecture so that there are no local memory spills.
*/
static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2
#if (__CUDA_ARCH__ >= 300 )
static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#else
static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#endif
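
// Note: __launch_bounds__( maxThreadsPerBlock, minBlocksPerMultiprocessor )
// asks the compiler to cap register usage per thread so that at least
// minBlocksPerMultiprocessor blocks can be resident on each multiprocessor.
// Keep in mind that __CUDA_ARCH__ is undefined in the host compilation pass,
// so host code (such as the launcher below) always sees the #else value.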
template< typename Operation, int blockSize >
__global__ void
__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
CudaReductionKernel( Operation operation,
const typename Operation::IndexType size,
const typename Operation::RealType* input1,
const typename Operation::RealType* input2,
typename Operation::ResultType* output )
{
typedef typename Operation::IndexType IndexType;
typedef typename Operation::ResultType ResultType;
ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
/***
* Get thread id (tid) and global thread id (gid).
* gridSize is the number of elements processed by all blocks at the
* same time.
*/
const IndexType tid = threadIdx.x;
IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
const IndexType gridSize = blockDim.x * gridDim.x;
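// Each thread therefore visits indices gid, gid + gridSize, gid + 2 * gridSize, ...
// (a grid-stride loop), so a fixed-size grid can process an input of any size.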
sdata[ tid ] = operation.initialValue();
/***
* Read data into the shared memory. We start with the
* sequential reduction.
*/
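// The sequential phase below is manually unrolled by a factor of four, then
// two, to amortize the loop overhead while each thread strides over the input.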
while( gid + 4 * gridSize < size )
{
operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 );
operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 );
operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
gid += 4 * gridSize;
}
while( gid + 2 * gridSize < size )
{
operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 );
operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 );
gid += 2 * gridSize;
}
while( gid < size )
{
operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 );
gid += gridSize;
}
__syncthreads();
//printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
//return;
/***
* Perform the parallel reduction.
*/
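// Classic tree reduction in shared memory: at each step the lower half of the
// active threads folds the upper half of sdata into itself. Since blockSize is
// a template parameter, the compiler removes the branches that cannot be taken.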
if( blockSize >= 1024 )
{
if( tid < 512 )
operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
__syncthreads();
}
if( blockSize >= 512 )
{
if( tid < 256 )
operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
__syncthreads();
}
if( blockSize >= 256 )
{
if( tid < 128 )
operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
__syncthreads();
//printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
}
if( blockSize >= 128 )
{
if( tid < 64 )
operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
__syncthreads();
//printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
}
/***
* This runs in one warp so it is synchronized implicitly.
*/
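// This is the classic warp-synchronous idiom: the volatile qualifier below
// keeps sdata accesses out of registers. Note that on architectures with
// independent thread scheduling (Volta and newer) this idiom is no longer
// guaranteed and would require explicit __syncwarp() calls.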
if( tid < 32 )
{
volatile ResultType* vsdata = sdata;
if( blockSize >= 64 )
{
operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
//printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
}
// TODO: If blocksize == 32, the following does not work
// We do not check if tid < 16. Fix it!!!
if( blockSize >= 32 )
{
operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
//printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
}
if( blockSize >= 16 )
{
operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
//printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
}
if( blockSize >= 8 )
{
operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
//printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
}
if( blockSize >= 4 )
{
operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
//printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
}
if( blockSize >= 2 )
{
operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
//printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
}
}
/***
* Store the result back in the global memory.
*/
if( tid == 0 )
{
//printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
output[ blockIdx.x ] = sdata[ 0 ];
}
}
template< typename Operation >
typename Operation::IndexType
CudaReductionKernelLauncher( Operation& operation,
const typename Operation::IndexType size,
const typename Operation::RealType* input1,
const typename Operation::RealType* input2,
typename Operation::ResultType*& output )
{
typedef typename Operation::IndexType IndexType;
typedef typename Operation::RealType RealType;
typedef typename Operation::ResultType ResultType;
// The number of blocks should be a multiple of the number of multiprocessors
// to ensure optimum balancing of the load. This is very important, because
// we run the kernel with a fixed number of blocks: the amount of work per
// block grows with the problem size, so even a small imbalance can cost
// us dearly.
// On Tesla K40c, desGridSize = 4 * 6 * 15 = 360.
// const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor
// * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
// On Tesla K40c, desGridSize = 6 * 15 = 90.
const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor
* Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
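// Capping the grid at desGridSize keeps the number of blocks, and thus the
// size of the partial-result buffer, fixed: when the input grows, each block
// simply processes more elements via the grid-stride loop.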
dim3 blockSize( 256 ), gridSize( 0 );
gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
// Get a reference to the reduction buffer singleton and set its size.
const size_t buf_size = desGridSize * sizeof( ResultType );
CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
if( ! cudaReductionBuffer.setSize( buf_size ) )
throw 1;
output = cudaReductionBuffer.template getData< ResultType >();
// when there is only one warp per block, we need to allocate two warps'
// worth of shared memory so that we do not index shared memory out of bounds
const IndexType shmem = (blockSize.x <= 32)
? 2 * blockSize.x * sizeof( ResultType )
: blockSize.x * sizeof( ResultType );
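// Rationale: the warp-level phase reads sdata[ tid + 16 ] down to
// sdata[ tid + 1 ] without bounds checks, so for block sizes of 32 or less
// the buffer must be padded to a full two warps to keep those reads in bounds.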
/***
* Depending on the blockSize we generate appropriate template instance.
*/
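// Note that blockSize.x is fixed to 256 above, so only the 256 branch is taken
// at present; the other instantiations keep the launcher correct should the
// block size ever be changed.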
switch( blockSize.x )
{
case 512:
CudaReductionKernel< Operation, 512 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 256:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 256 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 256 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 128:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 128 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 128 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 64:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 64 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 64 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 32:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 32 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 32 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 16:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 16 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 16 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 8:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 8 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 8 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 4:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 4 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 4 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 2:
cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 2 >, cudaFuncCachePreferShared);
CudaReductionKernel< Operation, 2 >
<<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
break;
case 1:
TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
default:
TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
}
checkCudaDevice;
// return the size of the output array on the CUDA device
return gridSize.x;
}
#endif

} // namespace Algorithms
} // namespace Containers
} // namespace TNL
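
/****
 * Usage sketch (a minimal, hypothetical example; ParallelReductionSum stands
 * for one of the operation types from ReductionOperations.h, which provide
 * the IndexType, RealType and ResultType typedefs together with
 * initialValue(), cudaFirstReduction() and commonReductionOnDevice() as
 * required by the kernel above):
 *
 *    using namespace TNL::Containers::Algorithms;
 *
 *    ParallelReductionSum< float, int > op;   // assumed operation type
 *    float* partialResults = nullptr;         // set by the launcher
 *    const int blocks = CudaReductionKernelLauncher( op, size, deviceInput,
 *                                                    nullptr, partialResults );
 *    // partialResults now holds one partial result per block on the device;
 *    // the caller copies these `blocks` values to the host and finishes the
 *    // reduction there.
 */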