Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +11 −12 Original line number Diff line number Diff line Loading @@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ]; //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; //__shared__ BlockType sharedBlocks[ WarpsCount ]; __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; __shared__ BlockType sharedBlocks[ WarpsCount ]; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index blockIdx = index / WarpSize; if( blockIdx >= blocks.getSize() - 1 ) return; //if( threadIdx.x < CudaBlockSize / WarpSize ) // multivectorShared[ threadIdx.x ] = zero; if( threadIdx.x < CudaBlockSize / WarpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % Loading @@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, __syncthreads(); const auto& block = sharedBlocks[ warpIdx ];*/ const BlockType block = blocks[ blockIdx ]; const Index begin = offsets[ block.getFirstSegment() ]; const Index firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp { Loading @@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute ); } //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; Loading @@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); }*/ } /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp } else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); Loading Loading @@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( segmentIdx, multivectorShared[ 0 ] ); } } }*/ } } #endif Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +13 −1 Original line number Diff line number Diff line Loading @@ -22,11 +22,13 @@ enum class Type { VECTOR = 2 }; //#define CSR_ADAPTIVE_UNION #ifdef CSR_ADAPTIVE_UNION template< typename Index > union CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept { this->index[0] = row; this->index[1] = index; Loading Loading @@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } __cuda_callable__ uint8_t getWarpIdx() const { return index[ 1 ]; } __cuda_callable__ uint8_t getWarpsCount() const { return 1; } void print( std::ostream& str ) const { Type type = this->getType(); Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +4 −1 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ namespace TNL { namespace Segments { namespace details { static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 }; static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 }; template< int SizeOfValue = 1, int StreamedSharedMemory_ = 24576 > Loading @@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters static constexpr int getSizeValueLogConstexpr( const int i ); static constexpr int getSizeOfValue() { return SizeOfValue; }; static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue ); static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." ); /** Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +11 −12 Original line number Diff line number Diff line Loading @@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ]; //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; //__shared__ BlockType sharedBlocks[ WarpsCount ]; __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; __shared__ BlockType sharedBlocks[ WarpsCount ]; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index blockIdx = index / WarpSize; if( blockIdx >= blocks.getSize() - 1 ) return; //if( threadIdx.x < CudaBlockSize / WarpSize ) // multivectorShared[ threadIdx.x ] = zero; if( threadIdx.x < CudaBlockSize / WarpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % Loading @@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, __syncthreads(); const auto& block = sharedBlocks[ warpIdx ];*/ const BlockType block = blocks[ blockIdx ]; const Index begin = offsets[ block.getFirstSegment() ]; const Index firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp { Loading @@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute ); } //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; Loading @@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); }*/ } /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp } else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); Loading Loading @@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( segmentIdx, multivectorShared[ 0 ] ); } } }*/ } } #endif Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +13 −1 Original line number Diff line number Diff line Loading @@ -22,11 +22,13 @@ enum class Type { VECTOR = 2 }; //#define CSR_ADAPTIVE_UNION #ifdef CSR_ADAPTIVE_UNION template< typename Index > union CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept { this->index[0] = row; this->index[1] = index; Loading Loading @@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } __cuda_callable__ uint8_t getWarpIdx() const { return index[ 1 ]; } __cuda_callable__ uint8_t getWarpsCount() const { return 1; } void print( std::ostream& str ) const { Type type = this->getType(); Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +4 −1 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ namespace TNL { namespace Segments { namespace details { static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 }; static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 }; template< int SizeOfValue = 1, int StreamedSharedMemory_ = 24576 > Loading @@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters static constexpr int getSizeValueLogConstexpr( const int i ); static constexpr int getSizeOfValue() { return SizeOfValue; }; static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue ); static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." ); /** Loading