Refactoring BiEllpack SpMV CUDA kernel. (e9f25336) · Commits · TNL / tnl-dev

src/TNL/Containers/Segments/BiEllpackView.hpp

+44 −53

Original line number	Diff line number	Diff line
		@@ -364,17 +364,14 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
		if( std::is_same< DeviceType, Devices::Cuda >::value )
		{
		#ifdef HAVE_CUDA
		//printStructure( std::cerr );
		//for( IndexType i = first; i < last; i += getWarpSize() )
		{
		//IndexType first = i;
		//IndexType last = TNL::min( this->getSize(), i + getWarpSize() );
		constexpr int BlockDim = getWarpSize();
		constexpr int BlockDim = 256;//getWarpSize();
		dim3 cudaBlockSize = BlockDim;
		const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() );
		const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
		const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
		IndexType sharedMemory = 0;
		if( ! RowMajorOrder )
		sharedMemory = cudaBlockSize.x * sizeof( RealType );

		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		@@ -387,7 +384,6 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
		cudaThreadSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		}
		#endif
		}
		}
		@@ -493,7 +489,6 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
		const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
		const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
		IndexType groupHeight = getWarpSize();
		//printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
		bool compute( true );
		IndexType localIdx( 0 );
		RealType result( zero );
		@@ -501,23 +496,15 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
		{
		IndexType groupOffset = groupPointers[ groupIdx ];
		const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset;
		//printf( "groupSize = %d \n", groupSize );
		if( groupSize )
		{
		const IndexType groupWidth = groupSize / groupHeight;
		for( IndexType i = 0; i < groupWidth; i++ )
		{
		if( RowMajorOrder )
		{
		reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) );
		}
		else
		{
		/*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n",
		segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight,
		groupIdx, groupSize, groupWidth );*/
		reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) );
		}
		localIdx++;
		}
		}
		@@ -561,9 +548,6 @@ segmentsReductionKernel( IndexType gridIdx,
		IndexType groupHeight = getWarpSize();
		IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 );

		RealType* temp( nullptr );
		if( ! RowMajorOrder )
		temp = Cuda::getSharedMemory< RealType >();
		__shared__ RealType results[ BlockDim ];
		results[ threadIdx.x ] = zero;
		__shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ];
		@@ -573,14 +557,15 @@ segmentsReductionKernel( IndexType gridIdx,
		__syncthreads();

		bool compute( true );
		if( RowMajorOrder )
		{
		for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
		{
		IndexType groupBegin = sharedGroupPointers[ group ];
		IndexType groupEnd = sharedGroupPointers[ group + 1 ];
		if( groupEnd - groupBegin > 0 )
		{
		if( RowMajorOrder )
		{

		if( inWarpIdx < groupHeight )
		{
		const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight;
		@@ -589,17 +574,23 @@ segmentsReductionKernel( IndexType gridIdx,
		reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) );
		}
		}
		groupHeight >>= 1;
		}
		}
		else
		{
		RealType* temp = Cuda::getSharedMemory< RealType >();
		for( IndexType group = 0; group < getLogWarpSize() + 1; group++ )
		{
		IndexType groupBegin = sharedGroupPointers[ group ];
		IndexType groupEnd = sharedGroupPointers[ group + 1 ];
		if( groupEnd - groupBegin > 0 )
		{
		temp[ threadIdx.x ] = zero;
		IndexType globalIdx = groupBegin + inWarpIdx;
		while( globalIdx < groupEnd )
		{
		reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
		/*printf( "FETCH: globalIdx = %d fetch = %d result = %d groupEnd = %d \n",
		globalIdx,
		( int ) fetch( globalIdx, compute ),
		( int ) temp[ threadIdx.x ], groupEnd );*/
		globalIdx += getWarpSize();
		}
		// TODO: reduction via templates
		@@ -613,9 +604,9 @@ segmentsReductionKernel( IndexType gridIdx,
		if( inWarpIdx < groupHeight )
		reduction( results[ threadIdx.x ], temp[ threadIdx.x ] );
		}
		}
		groupHeight >>= 1;
		}
		}
		__syncthreads();
		if( warpStart + inWarpIdx >= last )
		return;