Commit 8d5b7fb2 authored by Tomáš Oberhuber

Deleting legacy constants in CSRAdaptiveKernelView.

parent 99b90ba4
1 merge request: !89 To/matrices adaptive csr
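This commit's core change: the kernel's tuning constants no longer travel through the template parameter list ( warpSize, WARPS, SHARED_PER_WARP, MAX_ELEM_PER_WARP are gone ); instead the kernel derives them as constexpr locals from details::CSRAdaptiveKernelParameters< sizeof( Real ) >. A minimal sketch of that pattern, with a hypothetical Params struct standing in for the real CSRAdaptiveKernelParameters (member names follow the diff, the numeric values are invented):

#include <cstddef>

// Hypothetical stand-in for details::CSRAdaptiveKernelParameters< sizeof( Real ) >.
template< int RealSize >
struct Params
{
   static constexpr int CudaBlockSize() { return RealSize == 8 ? 128 : 256; }
   static constexpr int WarpsCount() { return CudaBlockSize() / 32; }
   static constexpr std::size_t StreamedSharedElementsPerWarp() { return 3072 / WarpsCount(); }
};

// The tuning constants are resolved inside the kernel from sizeof( Real ),
// so callers no longer have to thread them through every instantiation.
template< typename Real >
__global__ void sketchKernel()
{
   constexpr int CudaBlockSize = Params< sizeof( Real ) >::CudaBlockSize();
   constexpr int WarpsCount = Params< sizeof( Real ) >::WarpsCount();
   constexpr std::size_t PerWarp = Params< sizeof( Real ) >::StreamedSharedElementsPerWarp();
   __shared__ Real streamShared[ WarpsCount ][ PerWarp ];
   __shared__ Real multivectorShared[ CudaBlockSize / 32 ];
   (void) streamShared; (void) multivectorShared; // body elided; see the diff below
}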
@@ -32,8 +32,6 @@ struct CSRAdaptiveKernelView
    CSRAdaptiveKernelView() = default;
-   CSRAdaptiveKernelView( BlocksType& blocks );
    void setBlocks( BlocksType& blocks, const int idx );
    ViewType getView();
...
@@ -26,11 +26,7 @@ namespace TNL {
 #ifdef HAVE_CUDA
-template< int warpSize,
-          int WARPS,
-          int SHARED_PER_WARP,
-          int MAX_ELEM_PER_WARP,
-          typename BlocksView,
+template< typename BlocksView,
           typename Offsets,
           typename Index,
           typename Fetch,
@@ -50,21 +46,19 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
-   static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
-   //constexpr int WarpSize = Cuda::getWarpSize();
-   //constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
-   //constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
-   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
-   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
-   constexpr size_t MAX_X_DIM = 2147483647;
-   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index blockIdx = index / warpSize;
+   constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   constexpr int WarpSize = Cuda::getWarpSize();
+   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
+   constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
+   __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
+   __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
+   const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
+   const Index blockIdx = index / WarpSize;
    if( blockIdx >= blocks.getSize() - 1 )
       return;
-   if( threadIdx.x < CudaBlockSize / warpSize )
+   if( threadIdx.x < CudaBlockSize / WarpSize )
       multivectorShared[ threadIdx.x ] = zero;
    Real result = zero;
    bool compute( true );
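One behavioral detail in this hunk beyond un-commenting the constants: the old global-index formula added gridIdx * MAX_X_DIM, an offset counted in blocks, while the new one multiplies ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) by blockDim.x, making the per-launch offset a thread count; this appears to fix the index for every launch after the first. A host-side comparison of the two formulas with hypothetical launch values:

#include <cstdint>
#include <cstdio>

int main()
{
   const std::int64_t maxGridSize = 2147483647;  // max grid x-dimension, in blocks
   const std::int64_t blockDim = 256;            // threads per block
   // First thread of the second launch (gridIdx == 1, blockIdx == 0, threadIdx == 0):
   const std::int64_t gridIdx = 1, blockIdx = 0, threadIdx = 0;

   // Old form: the launch offset is in blocks, far smaller than one launch's thread count.
   const std::int64_t oldIndex = gridIdx * maxGridSize + blockIdx * blockDim + threadIdx;
   // New form: the launch offset is in threads, continuing exactly where launch 0 ended.
   const std::int64_t newIndex = ( gridIdx * maxGridSize + blockIdx ) * blockDim + threadIdx;

   std::printf( "old: %lld  new: %lld  threads per launch: %lld\n",
                (long long) oldIndex, (long long) newIndex,
                (long long) ( maxGridSize * blockDim ) );
   return 0;
}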
@@ -80,7 +74,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       const Index end = begin + block.getSize();
       // Stream data to shared memory
-      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
+      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
       {
          streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
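The streaming loop above is a warp-stride copy: the lanes of one warp sweep the block's range [ begin, end ) in steps of WarpSize, so consecutive lanes read consecutive global addresses (coalesced) and each element lands at offset globalIdx - begin in that warp's private strip of shared memory. A self-contained sketch of the pattern; the names, array shape, and plain-array input are illustrative, not the kernel's real signature:

// Warp-stride copy of one block's elements into a per-warp shared strip.
template< typename Real, int Warps, int PerWarp, int WarpSize = 32 >
__device__ void streamToShared( const Real* global, int begin, int end,
                                Real ( &shared )[ Warps ][ PerWarp ] )
{
   const int laneIdx = threadIdx.x % WarpSize;  // lane within this warp
   const int warpIdx = threadIdx.x / WarpSize;  // warp within this block
   for( int globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WarpSize )
      shared[ warpIdx ][ globalIdx - begin ] = global[ globalIdx ];
   __syncwarp();  // make the strip visible to all lanes before it is reused
}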
@@ -90,7 +84,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
-      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
       {
          const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
          result = zero;
@@ -105,7 +99,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       const Index end = begin + block.getSize();
       const Index segmentIdx = block.getFirstSegment();
-      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
+      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WarpSize )
          result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
       // Parallel reduction
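Both code paths fetch through details::FetchLambdaAdapter instead of calling fetch directly (the //fetch( globalIdx, compute ) remnant in the streaming hunk shows the older direct call). Judging by the call sites, the adapter lets the caller pass either a brief fetch( globalIdx, compute ) lambda or a full fetch( segmentIdx, localIdx, globalIdx, compute ) one, with -1 filling the slots the brief form ignores. A simplified, hypothetical sketch of that dispatch; the real adapter's implementation may differ:

#include <type_traits>

template< typename Index, typename Fetch >
struct FetchAdapterSketch
{
   static __device__ auto call( Fetch& fetch, Index segmentIdx, Index localIdx,
                                Index globalIdx, bool& compute )
   {
      // Pick the lambda arity at compile time and forward the matching arguments.
      if constexpr( std::is_invocable_v< Fetch, Index, bool& > )
         return fetch( globalIdx, compute );                         // brief form
      else
         return fetch( segmentIdx, localIdx, globalIdx, compute );   // full form
   }
};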
@@ -163,7 +157,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
       // Reduction in multivectorShared
       if( block.getWarpIdx() == 0 && laneIdx < 16 )
       {
-         constexpr int totalWarps = CudaBlockSize / warpSize;
+         constexpr int totalWarps = CudaBlockSize / WarpSize;
          if( totalWarps >= 32 )
          {
             multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
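The guarded block above starts a tree reduction over multivectorShared: lanes 0-15 of warp 0 fold the upper half of the array onto the lower half, and because totalWarps is constexpr, the chain of if( totalWarps >= ... ) steps compiles down to exactly those the block size requires. A compact sketch of the full folding chain under that assumption; this hypothetical helper must be called by all lanes of warp 0:

template< int totalWarps, typename Real, typename Reduce >
__device__ void foldMultivector( Real* multivectorShared, int laneIdx, Reduce reduce )
{
   // Each step halves the active range; dead steps vanish at compile time.
   if( totalWarps >= 32 && laneIdx < 16 )
      multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
   __syncwarp();
   if( totalWarps >= 16 && laneIdx < 8 )
      multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] );
   __syncwarp();
   if( totalWarps >= 8 && laneIdx < 4 )
      multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] );
   __syncwarp();
   if( totalWarps >= 4 && laneIdx < 2 )
      multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] );
   __syncwarp();
   if( totalWarps >= 2 && laneIdx == 0 )
      multivectorShared[ 0 ] = reduce( multivectorShared[ 0 ], multivectorShared[ 1 ] );
}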
@@ -199,14 +193,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
 }
 #endif
-/*template< typename Index,
-          typename Device >
-CSRAdaptiveKernelView< Index, Device >::
-CSRAdaptiveKernelView( BlocksType& blocks )
-{
-   this->blocks.bind( blocks );
-}*/
 template< typename Index,
           typename Device >
 void
@@ -272,54 +258,28 @@ segmentsReduction( const OffsetsView& offsets,
       return;
    }
-   static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
-   /* Max length of row to process one warp for CSR Light, MultiVector */
-   //static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
-   /* Max length of row to process one warp for CSR Adaptive */
-   //static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp();
-   /* How many shared memory use per block in CSR Adaptive kernel */
-   static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory();
-   /* Number of elements in shared memory */
-   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
-   /* Number of warps in block for CSR Adaptive */
-   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
-   /* Number of elements in shared memory per one warp */
-   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
-   constexpr int warpSize = 32;
    Index blocksCount;
-   const Index threads = THREADS_ADAPTIVE;
-   constexpr size_t MAX_X_DIM = 2147483647;
+   const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647;
-   /* Fill blocks */
-   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block
+   // Fill blocks
+   size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block
-   /* Execute kernels on device */
+   // Execute kernels on device
    for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
    {
-      if (MAX_X_DIM * threads >= neededThreads)
+      if( maxGridSize * threads >= neededThreads )
       {
-         blocksCount = roundUpDivision(neededThreads, threads);
+         blocksCount = roundUpDivision( neededThreads, threads );
          neededThreads = 0;
       }
       else
       {
-         blocksCount = MAX_X_DIM;
-         neededThreads -= MAX_X_DIM * threads;
+         blocksCount = maxGridSize;
+         neededThreads -= maxGridSize * threads;
       }
       segmentsReductionCSRAdaptiveKernel<
-         warpSize,
-         WARPS,
-         SHARED_PER_WARP,
-         details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(),
          BlocksView,
          OffsetsView,
         Index, Fetch, Reduction, ResultKeeper, Real, Args... >
...
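The launch loop splits the work over several grids because a single grid is capped at maxGridSize blocks in the x-dimension; roundUpDivision is, as its use implies, a ceiling division. The same control flow as a host-only sketch with invented numbers:

#include <cstdio>

// Ceiling division, matching what the code expects from roundUpDivision.
static long roundUpDivision( long dividend, long divisor )
{
   return dividend / divisor + ( dividend % divisor != 0 );
}

int main()
{
   const long threads = 256;       // threads per block
   const long maxGridSize = 1000;  // deliberately tiny grid cap, for illustration
   long neededThreads = 600000;    // total work, e.g. one warp per adaptive block

   for( int gridIdx = 0; neededThreads != 0; gridIdx++ )
   {
      long blocksCount;
      if( maxGridSize * threads >= neededThreads )
      {
         blocksCount = roundUpDivision( neededThreads, threads );  // last, partial launch
         neededThreads = 0;
      }
      else
      {
         blocksCount = maxGridSize;                 // full launch
         neededThreads -= maxGridSize * threads;
      }
      std::printf( "launch %d: %ld blocks\n", gridIdx, blocksCount );
      // Real code launches here:
      // segmentsReductionCSRAdaptiveKernel< ... ><<< blocksCount, threads >>>( ..., gridIdx, ... );
   }
   return 0;
}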