diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h index 5ade54d02eedd3e2a5c1b2778bc8e3a215058c2c..22cf447ecb3f4398bebecb6e368339f84083a1ce 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h @@ -67,27 +67,6 @@ struct CSRAdaptiveKernel static TNL::String getKernelType(); - - static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; - - // How many shared memory use per block in CSR Adaptive kernel - static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO: - - // Number of elements in shared memory - static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); - - // Number of warps in block for CSR Adaptive - static constexpr Index WARPS = THREADS_ADAPTIVE / 32; - - // Number of elements in shared memory per one warp - static constexpr Index SHARED_PER_WARP = SHARED / WARPS; - - // Max length of row to process one warp for CSR Light, MultiVector - static constexpr Index MAX_ELEMENTS_PER_WARP = 384; - - // Max length of row to process one warp for CSR Adaptive - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp(); - template< typename Offsets > void init( const Offsets& offsets ); diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp index ff2db147be16312e79260fbd2ed4531570e5816f..13c653c6c53e13d2b9c891b7bf9d9e2ae49a2a1d 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp @@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets ) if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); - const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); + const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h index 2546580f8f3492460460748f07a28a17ce9dbcdf..56f203a74b501957f733711fb67e7f00efbf1e6d 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h @@ -24,7 +24,7 @@ struct CSRAdaptiveKernelParameters * * \return CUDA block size. */ - static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; }; + static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; }; //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; /**