From 0785c1dac5e0b6dad7f1ee39eccca92696aaf1bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Wed, 10 Feb 2021 15:50:44 +0100 Subject: [PATCH] Adaptive CSR kernels adapts to different Value/Real types. --- .../Algorithms/Segments/CSRAdaptiveKernel.h | 2 + .../Algorithms/Segments/CSRAdaptiveKernel.hpp | 7 +--- .../Segments/CSRAdaptiveKernelView.h | 5 ++- .../Segments/CSRAdaptiveKernelView.hpp | 28 +++---------- .../details/CSRAdaptiveKernelParameters.h | 41 +++++++++++++++++-- 5 files changed, 51 insertions(+), 32 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h index 22cf447ecb..d6c3f2b92d 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h @@ -65,6 +65,8 @@ struct CSRAdaptiveKernel static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; }; + static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); }; + static TNL::String getKernelType(); template< typename Offsets > diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp index 13c653c6c5..7bcb66c28c 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp @@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets ) if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); - const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize(); + const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; @@ -182,10 +182,7 @@ initValueSize( const Offsets& offsets ) start = nextStart; } inBlocks.emplace_back(nextStart); - //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl; - TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" ); - TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" ); - this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks; + this->blocksArray[ getSizeValueLog( SizeOfValue ) ] = inBlocks; } } // namespace Segments diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h index 113008ad08..b81d360278 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h @@ -12,6 +12,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> +#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { @@ -28,7 +29,9 @@ struct CSRAdaptiveKernelView using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; - static constexpr int MaxValueSizeLog = 6; + static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog; + + static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); }; CSRAdaptiveKernelView() = default; diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index 427b5eba7b..2ddfcd65c8 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -113,32 +113,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } else // blockType == Type::LONG - several warps per segment { - // Number of elements processed by previous warps - //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; - /// block.getWarpIdx() * MAX_ELEM_PER_WARP; - //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; - //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; - //const int tid = threadIdx.x; - //const int inBlockWarpIdx = block.getWarpIdx(); - //if( to > end ) - // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; - //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); - //if( laneIdx == 0 ) - // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, - // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); - //result += values[i] * inVector[columnIndexes[i]]; } - //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); @@ -146,9 +131,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); - //if( laneIdx == 0 ) - // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); - const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; @@ -249,9 +231,9 @@ segmentsReduction( const OffsetsView& offsets, Args... args ) const { #ifdef HAVE_CUDA - int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) ); + int valueSizeLog = getSizeValueLog( sizeof( Real ) ); - if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog ) + if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); @@ -261,11 +243,11 @@ segmentsReduction( const OffsetsView& offsets, Index blocksCount; const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); - constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647; + constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); - // Fill blocks + // Fill blocks size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block - // Execute kernels on device + // Execute kernels on device for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if( maxGridSize * threads >= neededThreads ) diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h index 56f203a74b..3fa0855cb6 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h @@ -15,17 +15,26 @@ namespace TNL { namespace Segments { namespace details { -template< int SizeOfValue, +static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 }; + +template< int SizeOfValue = 1, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { + static constexpr int MaxValueSizeLog = 6; + + static constexpr int getSizeValueLogConstexpr( const int i ); + + static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue ); + static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." ); + /** * \brief Computes number of CUDA threads per block depending on Value type. * * \return CUDA block size. */ - static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; }; - //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; + static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; }; + //{ return SizeOfValue == 8 ? 128 : 256; }; /** * \brief Returns amount of shared memory dedicated for stream CSR kernel. @@ -64,6 +73,32 @@ struct CSRAdaptiveKernelParameters * \return Maximum number of elements per warp for adaptive kernel. */ static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; + + static int getSizeValueLog( const int i ) + { + if( i == 1 ) return 0; + if( i == 2 ) return 1; + if( i <= 4 ) return 2; + if( i <= 8 ) return 3; + if( i <= 16 ) return 4; + return 5; + } +}; + + +template< int SizeOfValue, + int StreamedSharedMemory_ > +constexpr int +CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >:: +getSizeValueLogConstexpr( const int i ) +{ + if( i == 1 ) return 0; + if( i == 2 ) return 1; + if( i <= 4 ) return 2; + if( i <= 8 ) return 3; + if( i <= 16 ) return 4; + if( i <= 32 ) return 5; + return 6; }; } // namespace details -- GitLab