Skip to content
Snippets Groups Projects
Commit 25e55913 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Added CSRAdaptiveKernelParameters.

parent 91806e1d
No related branches found
No related tags found
1 merge request!89To/matrices adaptive csr
......@@ -65,7 +65,7 @@ struct CSRAdaptiveKernel
static TNL::String getKernelType();
static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
/* Amount of shared memory used per block in the CSR Adaptive kernel */
static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
......
......@@ -18,6 +18,7 @@
#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
namespace TNL {
namespace Algorithms {
......@@ -25,8 +26,7 @@ namespace TNL {
#ifdef HAVE_CUDA
template< int CudaBlockSize,
int warpSize,
template< int warpSize,
int WARPS,
int SHARED_PER_WARP,
int MAX_ELEM_PER_WARP,
......@@ -50,6 +50,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
Real zero,
Args... args )
{
static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
constexpr int WarpSize = Cuda::getWarpSize();
constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();
__shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
__shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
constexpr size_t MAX_X_DIM = 2147483647;
......@@ -264,10 +270,7 @@ segmentsReduction( const OffsetsView& offsets,
return;
}
static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
//static constexpr Index THREADS_SCALAR = 128;
//static constexpr Index THREADS_VECTOR = 128;
//static constexpr Index THREADS_LIGHT = 128;
static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
/* Max length of a row to be processed by one warp for CSR Light, MultiVector */
//static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
......@@ -311,7 +314,6 @@ segmentsReduction( const OffsetsView& offsets,
}
segmentsReductionCSRAdaptiveKernel<
THREADS_ADAPTIVE,
warpSize,
WARPS,
SHARED_PER_WARP,
......
/***************************************************************************
CSRAdaptiveKernelParameters.h - description
-------------------
begin : Jan 25, 2021 -> Joe Biden inauguration
copyright : (C) 2021 by Tomas Oberhuber
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
#pragma once
namespace TNL {
namespace Algorithms {
namespace Segments {
namespace details {
/**
 * \brief Compile-time tuning parameters of the CSR adaptive kernel.
 *
 * Every member is static and usable in constant expressions, so the values
 * can serve as template arguments or as sizes of statically sized arrays.
 *
 * \tparam Value is the type of the elements stored in the streamed shared
 *         memory; only its size is used, to convert the shared memory size
 *         into a number of elements.
 */
template< typename Value >
struct CSRAdaptiveKernelParameters
{
   /**
    * \brief Size of the shared memory dedicated to the streamed CSR kernel.
    *
    * The value is in bytes: \ref StreamedSharedElementsCount divides it by
    * sizeof( Value ) to obtain the number of elements.
    */
   static constexpr int StreamedSharedMemory_ = 20000;

   /**
    * \brief Returns the number of CUDA threads per block.
    *
    * The block size is currently fixed and does NOT depend on Value.
    * Value-dependent variants that were considered but are not in use:
    *  - sizeof( Value ) == 8 ? 128 : 256
    *  - max( 1024 / sizeof( Value ), warp size )
    *
    * \return CUDA block size.
    */
   static constexpr int CudaBlockSize() { return 256; }

   /**
    * \brief Returns amount of shared memory dedicated for stream CSR kernel.
    *
    * \return Stream shared memory in bytes.
    */
   static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; }

   /**
    * \brief Number of elements fitting into streamed shared memory.
    *
    * \return Streamed shared memory size divided by sizeof( Value ), rounded down.
    */
   static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }

   /**
    * \brief Computes number of warps in one CUDA block.
    *
    * \return CUDA block size divided by the warp size reported by Cuda::getWarpSize().
    */
   static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); }

   /**
    * \brief Computes number of elements to be streamed into the shared memory.
    *
    * The streamed shared memory is split evenly among the warps of one block.
    *
    * \return Number of elements to be streamed into the shared memory by one warp.
    */
   static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); }

   /**
    * \brief Returns maximum number of elements per warp for vector and hybrid kernel.
    *
    * \return Maximum number of elements per warp for vector and hybrid kernel.
    */
   static constexpr int MaxVectorElementsPerWarp() { return 384; }

   /**
    * \brief Returns maximum number of elements per warp for adaptive kernel.
    *
    * \return Maximum number of elements per warp for adaptive kernel.
    */
   static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }
};
} // namespace details
} // namespace Segments
} // namespace Algorithms
} // namespace TNL
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment