Loading src/TNL/Containers/Segments/BiEllpackView.hpp +44 −53 Original line number Diff line number Diff line Loading @@ -364,17 +364,14 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA //printStructure( std::cerr ); //for( IndexType i = first; i < last; i += getWarpSize() ) { //IndexType first = i; //IndexType last = TNL::min( this->getSize(), i + getWarpSize() ); constexpr int BlockDim = getWarpSize(); constexpr int BlockDim = 256;//getWarpSize(); dim3 cudaBlockSize = BlockDim; const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() ); const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); IndexType sharedMemory = 0; if( ! RowMajorOrder ) sharedMemory = cudaBlockSize.x * sizeof( RealType ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { Loading @@ -387,7 +384,6 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red cudaThreadSynchronize(); TNL_CHECK_CUDA_DEVICE; } } #endif } } Loading Loading @@ -493,7 +489,6 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx, const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize(); const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx ); IndexType groupHeight = getWarpSize(); //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount ); bool compute( true ); IndexType localIdx( 0 ); RealType result( zero ); Loading @@ -501,23 +496,15 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx, { IndexType groupOffset = groupPointers[ groupIdx ]; const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset; //printf( "groupSize = %d \n", groupSize ); if( groupSize ) { const IndexType groupWidth = groupSize / groupHeight; for( IndexType i = 0; i < groupWidth; i++ ) { if( RowMajorOrder ) { reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) ); } else { /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n", segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, groupIdx, groupSize, groupWidth );*/ reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) ); } localIdx++; } } Loading Loading @@ -561,9 +548,6 @@ segmentsReductionKernel( IndexType gridIdx, IndexType groupHeight = getWarpSize(); IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 ); RealType* temp( nullptr ); if( ! RowMajorOrder ) temp = Cuda::getSharedMemory< RealType >(); __shared__ RealType results[ BlockDim ]; results[ threadIdx.x ] = zero; __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ]; Loading @@ -573,14 +557,15 @@ segmentsReductionKernel( IndexType gridIdx, __syncthreads(); bool compute( true ); if( RowMajorOrder ) { for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; if( groupEnd - groupBegin > 0 ) { if( RowMajorOrder ) { if( inWarpIdx < groupHeight ) { const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight; Loading @@ -589,17 +574,23 @@ segmentsReductionKernel( IndexType gridIdx, reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) ); } } groupHeight >>= 1; } } else { RealType* temp = Cuda::getSharedMemory< RealType >(); for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; if( groupEnd - groupBegin > 0 ) { temp[ threadIdx.x ] = zero; IndexType globalIdx = groupBegin + inWarpIdx; while( globalIdx < groupEnd ) { reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) ); /*printf( "FETCH: globalIdx = %d fetch = %d result = %d groupEnd = %d \n", globalIdx, ( int ) fetch( globalIdx, compute ), ( int ) temp[ threadIdx.x ], groupEnd );*/ globalIdx += getWarpSize(); } // TODO: reduction via templates Loading @@ -613,9 +604,9 @@ segmentsReductionKernel( IndexType gridIdx, if( inWarpIdx < groupHeight ) reduction( results[ threadIdx.x ], temp[ threadIdx.x ] ); } } groupHeight >>= 1; } } __syncthreads(); if( warpStart + inWarpIdx >= last ) return; Loading Loading
src/TNL/Containers/Segments/BiEllpackView.hpp +44 −53 Original line number Diff line number Diff line Loading @@ -364,17 +364,14 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA //printStructure( std::cerr ); //for( IndexType i = first; i < last; i += getWarpSize() ) { //IndexType first = i; //IndexType last = TNL::min( this->getSize(), i + getWarpSize() ); constexpr int BlockDim = getWarpSize(); constexpr int BlockDim = 256;//getWarpSize(); dim3 cudaBlockSize = BlockDim; const IndexType stripsCount = roundUpDivision( last - first, getWarpSize() ); const IndexType cudaBlocks = roundUpDivision( stripsCount * getWarpSize(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); IndexType sharedMemory = 0; if( ! RowMajorOrder ) sharedMemory = cudaBlockSize.x * sizeof( RealType ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { Loading @@ -387,7 +384,6 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red cudaThreadSynchronize(); TNL_CHECK_CUDA_DEVICE; } } #endif } } Loading Loading @@ -493,7 +489,6 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx, const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize(); const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, RowMajorOrder, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx ); IndexType groupHeight = getWarpSize(); //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount ); bool compute( true ); IndexType localIdx( 0 ); RealType result( zero ); Loading @@ -501,23 +496,15 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx, { IndexType groupOffset = groupPointers[ groupIdx ]; const IndexType groupSize = groupPointers[ groupIdx + 1 ] - groupOffset; //printf( "groupSize = %d \n", groupSize ); if( groupSize ) { const IndexType groupWidth = groupSize / groupHeight; for( IndexType i = 0; i < groupWidth; i++ ) { if( RowMajorOrder ) { reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm * groupWidth + i, compute ) ); } else { /*printf( "segmentIdx = %d localIdx = %d globalIdx = %d groupIdx = %d groupSize = %d groupWidth = %d\n", segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, groupIdx, groupSize, groupWidth );*/ reduction( result, fetch( segmentIdx, localIdx, groupOffset + rowStripPerm + i * groupHeight, compute ) ); } localIdx++; } } Loading Loading @@ -561,9 +548,6 @@ segmentsReductionKernel( IndexType gridIdx, IndexType groupHeight = getWarpSize(); IndexType firstGroupIdx = strip * ( getLogWarpSize() + 1 ); RealType* temp( nullptr ); if( ! RowMajorOrder ) temp = Cuda::getSharedMemory< RealType >(); __shared__ RealType results[ BlockDim ]; results[ threadIdx.x ] = zero; __shared__ IndexType sharedGroupPointers[ 7 ]; // TODO: getLogWarpSize() + 1 ]; Loading @@ -573,14 +557,15 @@ segmentsReductionKernel( IndexType gridIdx, __syncthreads(); bool compute( true ); if( RowMajorOrder ) { for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; if( groupEnd - groupBegin > 0 ) { if( RowMajorOrder ) { if( inWarpIdx < groupHeight ) { const IndexType groupWidth = ( groupEnd - groupBegin ) / groupHeight; Loading @@ -589,17 +574,23 @@ segmentsReductionKernel( IndexType gridIdx, reduction( results[ threadIdx.x ], fetch( globalIdx++, compute ) ); } } groupHeight >>= 1; } } else { RealType* temp = Cuda::getSharedMemory< RealType >(); for( IndexType group = 0; group < getLogWarpSize() + 1; group++ ) { IndexType groupBegin = sharedGroupPointers[ group ]; IndexType groupEnd = sharedGroupPointers[ group + 1 ]; if( groupEnd - groupBegin > 0 ) { temp[ threadIdx.x ] = zero; IndexType globalIdx = groupBegin + inWarpIdx; while( globalIdx < groupEnd ) { reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) ); /*printf( "FETCH: globalIdx = %d fetch = %d result = %d groupEnd = %d \n", globalIdx, ( int ) fetch( globalIdx, compute ), ( int ) temp[ threadIdx.x ], groupEnd );*/ globalIdx += getWarpSize(); } // TODO: reduction via templates Loading @@ -613,9 +604,9 @@ segmentsReductionKernel( IndexType gridIdx, if( inWarpIdx < groupHeight ) reduction( results[ threadIdx.x ], temp[ threadIdx.x ] ); } } groupHeight >>= 1; } } __syncthreads(); if( warpStart + inWarpIdx >= last ) return; Loading