Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +2 −0 Original line number Diff line number Diff line Loading @@ -65,6 +65,8 @@ struct CSRAdaptiveKernel static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; }; static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); }; static TNL::String getKernelType(); template< typename Offsets > Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +2 −5 Original line number Diff line number Diff line Loading @@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets ) if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; Loading @@ -182,10 +182,7 @@ initValueSize( const Offsets& offsets ) start = nextStart; } inBlocks.emplace_back(nextStart); //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl; TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" ); TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" ); this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks; this->blocksArray[ getSizeValueLog( SizeOfValue ) ] = inBlocks; } } // namespace Segments Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +4 −1 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { Loading @@ -28,7 +29,9 @@ struct CSRAdaptiveKernelView using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; static constexpr int MaxValueSizeLog = 6; static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog; static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); }; CSRAdaptiveKernelView() = default; Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +5 −23 Original line number Diff line number Diff line Loading @@ -113,32 +113,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); Loading @@ -146,9 +131,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; Loading Loading @@ -249,9 +231,9 @@ segmentsReduction( const OffsetsView& offsets, Args... args ) const { #ifdef HAVE_CUDA int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) ); int valueSizeLog = getSizeValueLog( sizeof( Real ) ); if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog ) if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); Loading @@ -261,7 +243,7 @@ segmentsReduction( const OffsetsView& offsets, Index blocksCount; const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647; constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); // Fill blocks size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +38 −3 Original line number Diff line number Diff line Loading @@ -15,17 +15,26 @@ namespace TNL { namespace Segments { namespace details { template< int SizeOfValue, static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 }; template< int SizeOfValue = 1, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { static constexpr int MaxValueSizeLog = 6; static constexpr int getSizeValueLogConstexpr( const int i ); static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue ); static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." ); /** * \brief Computes number of CUDA threads per block depending on Value type. * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; }; //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; }; //{ return SizeOfValue == 8 ? 128 : 256; }; /** * \brief Returns amount of shared memory dedicated for stream CSR kernel. Loading Loading @@ -64,6 +73,32 @@ struct CSRAdaptiveKernelParameters * \return Maximum number of elements per warp for adaptive kernel. */ static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; static int getSizeValueLog( const int i ) { if( i == 1 ) return 0; if( i == 2 ) return 1; if( i <= 4 ) return 2; if( i <= 8 ) return 3; if( i <= 16 ) return 4; return 5; } }; template< int SizeOfValue, int StreamedSharedMemory_ > constexpr int CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >:: getSizeValueLogConstexpr( const int i ) { if( i == 1 ) return 0; if( i == 2 ) return 1; if( i <= 4 ) return 2; if( i <= 8 ) return 3; if( i <= 16 ) return 4; if( i <= 32 ) return 5; return 6; }; } // namespace details Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +2 −0 Original line number Diff line number Diff line Loading @@ -65,6 +65,8 @@ struct CSRAdaptiveKernel static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; }; static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); }; static TNL::String getKernelType(); template< typename Offsets > Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +2 −5 Original line number Diff line number Diff line Loading @@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets ) if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; Loading @@ -182,10 +182,7 @@ initValueSize( const Offsets& offsets ) start = nextStart; } inBlocks.emplace_back(nextStart); //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl; TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" ); TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" ); this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks; this->blocksArray[ getSizeValueLog( SizeOfValue ) ] = inBlocks; } } // namespace Segments Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +4 −1 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { Loading @@ -28,7 +29,9 @@ struct CSRAdaptiveKernelView using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; static constexpr int MaxValueSizeLog = 6; static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog; static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); }; CSRAdaptiveKernelView() = default; Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +5 −23 Original line number Diff line number Diff line Loading @@ -113,32 +113,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); Loading @@ -146,9 +131,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; Loading Loading @@ -249,9 +231,9 @@ segmentsReduction( const OffsetsView& offsets, Args... args ) const { #ifdef HAVE_CUDA int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) ); int valueSizeLog = getSizeValueLog( sizeof( Real ) ); if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog ) if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); Loading @@ -261,7 +243,7 @@ segmentsReduction( const OffsetsView& offsets, Index blocksCount; const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647; constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); // Fill blocks size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +38 −3 Original line number Diff line number Diff line Loading @@ -15,17 +15,26 @@ namespace TNL { namespace Segments { namespace details { template< int SizeOfValue, static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 }; template< int SizeOfValue = 1, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { static constexpr int MaxValueSizeLog = 6; static constexpr int getSizeValueLogConstexpr( const int i ); static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue ); static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." ); /** * \brief Computes number of CUDA threads per block depending on Value type. * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; }; //std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; }; //{ return SizeOfValue == 8 ? 128 : 256; }; /** * \brief Returns amount of shared memory dedicated for stream CSR kernel. Loading Loading @@ -64,6 +73,32 @@ struct CSRAdaptiveKernelParameters * \return Maximum number of elements per warp for adaptive kernel. */ static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; static int getSizeValueLog( const int i ) { if( i == 1 ) return 0; if( i == 2 ) return 1; if( i <= 4 ) return 2; if( i <= 8 ) return 3; if( i <= 16 ) return 4; return 5; } }; template< int SizeOfValue, int StreamedSharedMemory_ > constexpr int CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >:: getSizeValueLogConstexpr( const int i ) { if( i == 1 ) return 0; if( i == 2 ) return 1; if( i <= 4 ) return 2; if( i <= 8 ) return 3; if( i <= 16 ) return 4; if( i <= 32 ) return 5; return 6; }; } // namespace details Loading