Skip to content
Snippets Groups Projects
Commit 440c35b9 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Added blocks in CSR adaptive kernel for different Value/Real type sizes.

parent 1da0935b
No related branches found
No related tags found
1 merge request!89To/matrices adaptive csr
......@@ -63,13 +63,15 @@ struct CSRAdaptiveKernel
using BlocksType = typename ViewType::BlocksType;
using BlocksView = typename BlocksType::ViewType;
static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };
static TNL::String getKernelType();
static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
// How many shared memory use per block in CSR Adaptive kernel
static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:
static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO:
// Number of elements in shared memory
static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
......@@ -84,14 +86,7 @@ struct CSRAdaptiveKernel
static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
// Max length of row to process one warp for CSR Adaptive
static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();
template< typename Offsets >
Index findLimit( const Index start,
const Offsets& offsets,
const Index size,
details::Type &type,
Index &sum );
static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp();
template< typename Offsets >
void init( const Offsets& offsets );
......@@ -118,7 +113,21 @@ struct CSRAdaptiveKernel
Args... args ) const;
protected:
BlocksType blocks;
template< int SizeOfValue, typename Offsets >
Index findLimit( const Index start,
const Offsets& offsets,
const Index size,
details::Type &type,
Index &sum );
template< int SizeOfValue,
typename Offsets >
void initValueSize( const Offsets& offsets );
/**
* \brief blocksArray[ i ] stores blocks for sizeof( Value ) == 2^i.
*/
BlocksType blocksArray[ MaxValueSizeLog() ];
ViewType view;
};
......
......@@ -31,10 +31,81 @@ getKernelType()
return ViewType::getKernelType();
};
template< typename Index,
typename Device >
template< typename Offsets >
void
CSRAdaptiveKernel< Index, Device >::
init( const Offsets& offsets )
{
this->template initValueSize< 1 >( offsets );
this->template initValueSize< 2 >( offsets );
this->template initValueSize< 4 >( offsets );
this->template initValueSize< 8 >( offsets );
this->template initValueSize< 16 >( offsets );
this->template initValueSize< 32 >( offsets );
for( int i = 0; i < MaxValueSizeLog(); i++ )
this->view.setBlocks( blocksArray[ i ], i );
}
template< typename Index,
typename Device >
void
CSRAdaptiveKernel< Index, Device >::
reset()
{
for( int i = 0; i < MaxValueSizeLog(); i++ )
{
this->blocksArray[ i ].reset();
this->view.setBlocks( this->blocksArray[ i ], i );
}
}
template< typename Index,
typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getView() -> ViewType
{
return this->view;
}
template< typename Index,
typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getConstView() const -> ConstViewType
{
return this->view;
};
template< typename Index,
typename Device >
template< typename OffsetsView,
typename Fetch,
typename Reduction,
typename ResultKeeper,
typename Real,
typename... Args >
void
CSRAdaptiveKernel< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
Index first,
Index last,
Fetch& fetch,
const Reduction& reduction,
ResultKeeper& keeper,
const Real& zero,
Args... args ) const
{
view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
}
template< typename Index,
typename Device >
template< int SizeOfValue,
typename Offsets >
Index
CSRAdaptiveKernel< Index, Device >::
findLimit( const Index start,
......@@ -48,7 +119,7 @@ findLimit( const Index start,
{
Index elements = offsets[ current + 1 ] - offsets[ current ];
sum += elements;
if( sum > SHARED_PER_WARP )
if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() )
{
if( current - start > 0 ) // extra row
{
......@@ -57,7 +128,7 @@ findLimit( const Index start,
}
else
{ // one long row
if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT )
type = details::Type::VECTOR;
else
type = details::Type::LONG;
......@@ -67,14 +138,15 @@ findLimit( const Index start,
}
type = details::Type::STREAM;
return size - 1; // return last row pointer
}
}
template< typename Index,
typename Device >
template< typename Offsets >
template< int SizeOfValue,
typename Offsets >
void
CSRAdaptiveKernel< Index, Device >::
init( const Offsets& offsets )
initValueSize( const Offsets& offsets )
{
using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
HostOffsetsType hostOffsets( offsets );
......@@ -88,7 +160,7 @@ init( const Offsets& offsets )
while( nextStart != rows - 1 )
{
details::Type type;
nextStart = findLimit( start, hostOffsets, rows, type, sum );
nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum );
if( type == details::Type::LONG )
{
......@@ -110,58 +182,10 @@ init( const Offsets& offsets )
start = nextStart;
}
inBlocks.emplace_back(nextStart);
this->blocks = inBlocks;
this->view.setBlocks( blocks );
}
template< typename Index,
typename Device >
void
CSRAdaptiveKernel< Index, Device >::
reset()
{
this->blocks.reset();
this->view.setBlocks( blocks );
}
template< typename Index,
typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getView() -> ViewType
{
return this->view;
}
template< typename Index,
typename Device >
auto
CSRAdaptiveKernel< Index, Device >::
getConstView() const -> ConstViewType
{
return this->view;
};
template< typename Index,
typename Device >
template< typename OffsetsView,
typename Fetch,
typename Reduction,
typename ResultKeeper,
typename Real,
typename... Args >
void
CSRAdaptiveKernel< Index, Device >::
segmentsReduction( const OffsetsView& offsets,
Index first,
Index last,
Fetch& fetch,
const Reduction& reduction,
ResultKeeper& keeper,
const Real& zero,
Args... args ) const
{
view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
//std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl;
TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" );
TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" );
this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks;
}
} // namespace Segments
......
......@@ -28,11 +28,13 @@ struct CSRAdaptiveKernelView
using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
using BlocksView = typename BlocksType::ViewType;
static constexpr int MaxValueSizeLog = 6;
CSRAdaptiveKernelView() = default;
CSRAdaptiveKernelView( BlocksType& blocks );
void setBlocks( BlocksType& blocks );
void setBlocks( BlocksType& blocks, const int idx );
ViewType getView();
......@@ -57,14 +59,14 @@ struct CSRAdaptiveKernelView
CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView );
void printBlocks() const;
void printBlocks( int idx ) const;
protected:
BlocksView blocks;
BlocksView blocksArray[ MaxValueSizeLog ];
};
} // namespace Segments
} // namespace Algorithms
} // namespace TNL
#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp>
\ No newline at end of file
#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp>
......@@ -50,10 +50,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
Real zero,
Args... args )
{
static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
constexpr int WarpSize = Cuda::getWarpSize();
constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();
static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
//constexpr int WarpSize = Cuda::getWarpSize();
//constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
//constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
__shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
......@@ -199,21 +199,21 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
}
#endif
template< typename Index,
/*template< typename Index,
typename Device >
CSRAdaptiveKernelView< Index, Device >::
CSRAdaptiveKernelView( BlocksType& blocks )
{
this->blocks.bind( blocks );
}
}*/
template< typename Index,
typename Device >
void
CSRAdaptiveKernelView< Index, Device >::
setBlocks( BlocksType& blocks )
setBlocks( BlocksType& blocks, const int idx )
{
this->blocks.bind( blocks );
this->blocksArray[ idx ].bind( blocks );
}
template< typename Index,
......@@ -263,23 +263,25 @@ segmentsReduction( const OffsetsView& offsets,
Args... args ) const
{
#ifdef HAVE_CUDA
if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) );
if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog )
{
TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
return;
}
static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
/* Max length of row to process one warp for CSR Light, MultiVector */
//static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
/* Max length of row to process one warp for CSR Adaptive */
static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();
//static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp();
/* How many shared memory use per block in CSR Adaptive kernel */
static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();
static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory();
/* Number of elements in shared memory */
static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
......@@ -298,7 +300,7 @@ segmentsReduction( const OffsetsView& offsets,
constexpr size_t MAX_X_DIM = 2147483647;
/* Fill blocks */
size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block
size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block
/* Execute kernels on device */
for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
{
......@@ -317,12 +319,12 @@ segmentsReduction( const OffsetsView& offsets,
warpSize,
WARPS,
SHARED_PER_WARP,
details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(),
BlocksView,
OffsetsView,
Index, Fetch, Reduction, ResultKeeper, Real, Args... >
<<<blocksCount, threads>>>(
this->blocks,
this->blocksArray[ valueSizeLog ],
gridIdx,
offsets,
first,
......@@ -342,7 +344,8 @@ CSRAdaptiveKernelView< Index, Device >&
CSRAdaptiveKernelView< Index, Device >::
operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
{
this->blocks.bind( kernelView.blocks );
for( int i = 0; i < MaxValueSizeLog; i++ )
this->blocksArray[ i ].bind( kernelView.blocksArray[ i ] );
return *this;
}
......@@ -350,8 +353,9 @@ template< typename Index,
typename Device >
void
CSRAdaptiveKernelView< Index, Device >::
printBlocks() const
printBlocks( int idx ) const
{
auto& blocks = this->blocksArray[ idx ];
for( Index i = 0; i < this->blocks.getSize(); i++ )
{
auto block = blocks.getElement( i );
......
......@@ -15,7 +15,7 @@ namespace TNL {
namespace Segments {
namespace details {
template< typename Value,
template< int SizeOfValue,
int StreamedSharedMemory_ = 24576 >
struct CSRAdaptiveKernelParameters
{
......@@ -37,7 +37,7 @@ struct CSRAdaptiveKernelParameters
/**
* \brief Number of elements fitting into streamed shared memory.
*/
static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };
static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; };
/**
* \brief Computes number of warps in one CUDA block.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment