Skip to content
Snippets Groups Projects
Commit 936f2c32 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Refactoring of adaptive CSR kernel.

parent 9b08c76c
No related branches found
No related tags found
1 merge request!89To/matrices adaptive csr
......@@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
__shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
//__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
//__shared__ BlockType sharedBlocks[ WarpsCount ];
__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
__shared__ BlockType sharedBlocks[ WarpsCount ];
const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
const Index blockIdx = index / WarpSize;
if( blockIdx >= blocks.getSize() - 1 )
return;
//if( threadIdx.x < CudaBlockSize / WarpSize )
// multivectorShared[ threadIdx.x ] = zero;
if( threadIdx.x < CudaBlockSize / WarpSize )
multivectorShared[ threadIdx.x ] = zero;
Real result = zero;
bool compute( true );
const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
......@@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
__syncthreads();
const auto& block = sharedBlocks[ warpIdx ];*/
const BlockType block = blocks[ blockIdx ];
const Index begin = offsets[ block.getFirstSegment() ];
const Index firstSegmentIdx = block.getFirstSegment();
const Index begin = offsets[ firstSegmentIdx ];
if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
{
......@@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
// Stream data to shared memory
for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
{
streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute );
}
//const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
/*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
{
const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
result = zero;
......@@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
keep( i, result );
}*/
}
}
/*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
{
const Index end = begin + block.getSize();
const Index segmentIdx = block.getFirstSegment();
......@@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
keep( segmentIdx, multivectorShared[ 0 ] );
}
}
}*/
}
}
#endif
......
......@@ -22,11 +22,13 @@ enum class Type {
VECTOR = 2
};
//#define CSR_ADAPTIVE_UNION
#ifdef CSR_ADAPTIVE_UNION
template< typename Index >
union CSRAdaptiveKernelBlockDescriptor
{
CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept
{
this->index[0] = row;
this->index[1] = index;
......@@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor
return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
}
__cuda_callable__ uint8_t getWarpIdx() const
{
return index[ 1 ];
}
__cuda_callable__ uint8_t getWarpsCount() const
{
return 1;
}
void print( std::ostream& str ) const
{
Type type = this->getType();
......
......@@ -15,7 +15,7 @@ namespace TNL {
namespace Segments {
namespace details {
static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };
static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };
template< int SizeOfValue = 1,
int StreamedSharedMemory_ = 24576 >
......@@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters
static constexpr int getSizeValueLogConstexpr( const int i );
static constexpr int getSizeOfValue() { return SizeOfValue; };
static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );
/**
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment