Skip to content
Snippets Groups Projects
Commit e9f25336 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refactoring BiEllapck SpMV CUDA kernel.

parent a92f219d
No related branches found
No related tags found
1 merge request!58To/matrices
...@@ -364,29 +364,25 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red ...@@ -364,29 +364,25 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
if( std::is_same< DeviceType, Devices::Cuda >::value ) if( std::is_same< DeviceType, Devices::Cuda >::value )
{ {
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
//printStructure( std::cerr ); constexpr int BlockDim = 256;//getWarpSize();
//for( IndexType i = first; i < last; i += getWarpSize() ) dim3 cudaBlockSize = BlockDim;
const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
IndexType sharedMemory = 0;
if( ! RowMajorOrder )
sharedMemory = cudaBlockSize.x * sizeof( RealType );
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
{ {
//IndexType first = i; dim3 cudaGridSize = Cuda::getMaxGridSize();
//IndexType last = TNL::min( this->getSize(), i + getWarpSize() ); if( gridIdx == cudaGrids - 1 )
constexpr int BlockDim = getWarpSize(); cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
dim3 cudaBlockSize = BlockDim; details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >
const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() ); <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x ); ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); cudaThreadSynchronize();
const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); TNL_CHECK_CUDA_DEVICE;
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
{
dim3 cudaGridSize = Cuda::getMaxGridSize();
if( gridIdx == cudaGrids - 1 )
cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args... >
<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
cudaThreadSynchronize();
TNL_CHECK_CUDA_DEVICE;
}
} }
#endif #endif
} }
...@@ -493,7 +489,6 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx, ...@@ -493,7 +489,6 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize(); const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx ); const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
IndexType groupHeight = getWarpSize(); IndexType groupHeight = getWarpSize();
//printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
bool compute( true ); bool compute( true );
IndexType localIdx( 0 ); IndexType localIdx( 0 );
RealType result( zero ); RealType result( zero );
...@@ -501,23 +496,15 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx, ...@@ -501,23 +496,15 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
{ {
IndexType groupOffset = groupPointers[ groupIdx ]; IndexType groupOffset = groupPointers[ groupIdx ];
const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset; const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset;
//printf( "groupSize = %d \n", groupSize );
if( groupSize ) if( groupSize )
{ {
const IndexType groupWidth = groupSize / groupHeight; const IndexType groupWidth = groupSize / groupHeight;
for( IndexType i = 0; i < groupWidth; i++ ) for( IndexType i = 0; i < groupWidth; i++ )
{ {
if( RowMajorOrder ) if( RowMajorOrder )
{
reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) ); reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
}
else else
{
/*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
groupIdx, groupSize, groupWidth );*/
reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) ); reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
}
localIdx++; localIdx++;
} }
} }
...@@ -561,9 +548,6 @@ segmentsReductionKernel( IndexType gridIdx, ...@@ -561,9 +548,6 @@ segmentsReductionKernel( IndexType gridIdx,
IndexType groupHeight = getWarpSize(); IndexType groupHeight = getWarpSize();
IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 ); IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 );
RealType* temp( nullptr );
if( ! RowMajorOrder )
temp = Cuda::getSharedMemory< RealType >();
__shared__ RealType results[ BlockDim ]; __shared__ RealType results[ BlockDim ];
results[ threadIdx.x ] = zero; results[ threadIdx.x ] = zero;
__shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ]; __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ];
...@@ -571,35 +555,42 @@ segmentsReductionKernel( IndexType gridIdx, ...@@ -571,35 +555,42 @@ segmentsReductionKernel( IndexType gridIdx,
if( threadIdx.x <= getLogWarpSize() + 1 ) if( threadIdx.x <= getLogWarpSize() + 1 )
sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ]; sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupIdx + threadIdx.x ];
__syncthreads(); __syncthreads();
bool compute( true ); bool compute( true );
for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) if( RowMajorOrder )
{ {
IndexType groupBegin = sharedGroupPointers[ group ]; for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
IndexType groupEnd = sharedGroupPointers[ group + 1 ];
if( groupEnd - groupBegin > 0 )
{ {
if( RowMajorOrder ) IndexType groupBegin = sharedGroupPointers[ group ];
IndexType groupEnd = sharedGroupPointers[ group + 1 ];
if( groupEnd - groupBegin > 0 )
{ {
if( inWarpIdx < groupHeight )
{ if( inWarpIdx < groupHeight )
const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight; {
IndexType globalIdx = groupBegin + inWarpIdx * groupWidth; const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
for( IndexType i = 0; i < groupWidth && compute; i++ ) IndexType globalIdx = groupBegin + inWarpIdx * groupWidth;
reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) ); for( IndexType i = 0; i < groupWidth && compute; i++ )
reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
}
} }
} groupHeight >>= 1;
else }
}
else
{
RealType* temp = Cuda::getSharedMemory< RealType >();
for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
{
IndexType groupBegin = sharedGroupPointers[ group ];
IndexType groupEnd = sharedGroupPointers[ group + 1 ];
if( groupEnd - groupBegin > 0 )
{ {
temp[ threadIdx.x ] = zero; temp[ threadIdx.x ] = zero;
IndexType globalIdx = groupBegin + inWarpIdx; IndexType globalIdx = groupBegin + inWarpIdx;
while( globalIdx < groupEnd ) while( globalIdx < groupEnd )
{ {
reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) ); reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
/*printf( "FETCH: globalIdx = %d fetch = %d result = %d groupEnd = %d \n",
globalIdx,
( int ) fetch( globalIdx, compute ),
( int ) temp[ threadIdx.x ], groupEnd );*/
globalIdx += getWarpSize(); globalIdx += getWarpSize();
} }
// TODO: reduction via templates // TODO: reduction via templates
...@@ -613,8 +604,8 @@ segmentsReductionKernel( IndexType gridIdx, ...@@ -613,8 +604,8 @@ segmentsReductionKernel( IndexType gridIdx,
if( inWarpIdx < groupHeight ) if( inWarpIdx < groupHeight )
reduction( results[ threadIdx.x ], temp[ threadIdx.x ] ); reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
} }
groupHeight >>= 1;
} }
groupHeight >>= 1;
} }
__syncthreads(); __syncthreads();
if( warpStart + inWarpIdx >= last ) if( warpStart + inWarpIdx >= last )
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment