Loading src/TNL/Containers/Segments/ChunkedEllpack.h +3 −0 Original line number Diff line number Diff line Loading @@ -151,6 +151,9 @@ class ChunkedEllpack ChunkedEllpackSliceInfoContainer slices; IndexType numberOfSlices; template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ > friend class ChunkedEllpack; }; } // namespace Segements Loading src/TNL/Containers/Segments/ChunkedEllpack.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -446,7 +446,7 @@ operator=( const ChunkedEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder this->rowToChunkMapping = source.rowToChunkMapping; this->rowToSliceMapping = source.rowToSliceMapping; this->rowPointers = source.rowPointers; this->chunksToSegmentMapping = source.chunksToSegmentsMapping; this->chunksToSegmentsMapping = source.chunksToSegmentsMapping; this->slices = source.slices; this->numberOfSlices = source.numberOfSlices; return *this; Loading src/TNL/Containers/Segments/ChunkedEllpackView.h +23 −1 Original line number Diff line number Diff line Loading @@ -147,12 +147,13 @@ class ChunkedEllpackView protected: #ifdef HAVE_CUDA template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > //__device__ __device__ void segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Loading @@ -161,6 +162,7 @@ class ChunkedEllpackView ResultKeeper keeper, Real zero, Args... args ) const; #endif IndexType size = 0, storageSize = 0; Loading @@ -187,6 +189,26 @@ class ChunkedEllpackView ChunkedEllpackSliceInfoContainerView slices; IndexType numberOfSlices; #ifdef HAVE_CUDA template< typename View_, typename Index_, typename Fetch_, typename Reduction_, typename ResultKeeper_, typename Real_, typename... Args_ > friend __global__ void ChunkedEllpackSegmentsReductionKernel( View_ chunkedEllpack, Index_ gridIdx, Index_ first, Index_ last, Fetch_ fetch, Reduction_ reduction, ResultKeeper_ keeper, Real_ zero, Args_... args ); #endif }; } // namespace Segements } // namespace Conatiners Loading src/TNL/Containers/Segments/ChunkedEllpackView.hpp +83 −68 Original line number Diff line number Diff line Loading @@ -38,7 +38,7 @@ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Real zero, Args... args ) { view.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif Loading Loading @@ -337,20 +337,29 @@ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { if( std::is_same< DeviceType, Devices::Host >::value ) { for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ ) IndexType firstSliceIdx( 0 ), lastSliceIdx( numberOfSlices ); if( first != 0 || last != this->size ) { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->size, "" ); firstSliceIdx = rowToSliceMapping.getElement( first ); lastSliceIdx = rowToSliceMapping.getElement( last - 1 ) + 1; } const IndexType chunksInSlice = this->chunksInSlice; auto rowToChunkMapping = this->rowToChunkMapping; auto rowToSliceMapping = this->rowToSliceMapping; auto slices = this->slices; auto work = [=] __cuda_callable__ ( IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = rowToSliceMapping[ segmentIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) if( segmentIdx != slices[ sliceIdx ].firstSegment ) { firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; } const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; const IndexType sliceOffset = slices[ sliceIdx ].pointer; const IndexType chunkSize = slices[ sliceIdx ].chunkSize; const IndexType segmentSize = segmentChunksCount * chunkSize; bool compute( true ); Loading @@ -364,17 +373,19 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const } else { IndexType localIdx( 0 ); for( IndexType chunkIdx = 0; chunkIdx < segmentChunksCount; chunkIdx++ ) { IndexType begin = sliceOffset + firstChunkOfSegment + chunkIdx; IndexType end = begin + chunksInSlice * chunkSize; IndexType localIdx( 0 ); for( IndexType j = begin; j < end && compute; j += chunksInSlice ) { f( segmentIdx, localIdx++, j, compute, args...); } } } } }; Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... ); } template< typename Device, Loading @@ -399,8 +410,8 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( std::is_same< DeviceType, Devices::Host >::value ) { segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... ); return; //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... ); //return; for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ ) { Loading Loading @@ -442,7 +453,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red } if( std::is_same< DeviceType, Devices::Cuda >::value ) { /*const IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; #ifdef HAVE_CUDA //const IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; // TODO: This ignores parameters first and last const IndexType cudaBlocks = this->numberOfSlices; const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); dim3 cudaBlockSize( this->chunksInSlice ), cudaGridSize; Loading @@ -452,10 +465,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); }*/ } #endif } } Loading Loading @@ -550,6 +564,7 @@ printStructure( std::ostream& str ) const << " chunk = " << this->rowToChunkMapping.getElement( i ) << std::endl; } #ifdef HAVE_CUDA template< typename Device, typename Index, bool RowMajorOrder > Loading @@ -558,7 +573,7 @@ template< typename Device, typename ResultKeeper, typename Real, typename... Args > //__cuda_callable__ __device__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, Loading @@ -574,60 +589,60 @@ segmentsReductionKernel( IndexType gridIdx, const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice; const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice; for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) //for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) { //const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; //if( sliceIdx >= lastSlice ) // return; const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx >= lastSlice ) return; RealType chunksResults[ 256 ]; //Cuda::getSharedMemory< RealType >(); for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx == 0 ) RealType* chunksResults = Cuda::getSharedMemory< RealType >(); //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx.x == 0 ) sliceInfo = this->slices[ sliceIdx ]; chunksResults[ threadIdx ] = zero; //__syncthreads(); chunksResults[ threadIdx.x ] = zero; __syncthreads(); const IndexType sliceOffset = sliceInfo.pointer; const IndexType chunkSize = sliceInfo.chunkSize; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x; const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != sliceInfo.firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; IndexType localIdx = ( threadIdx - firstChunkOfSegment ) * chunkSize; IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize; bool compute( true ); if( RowMajorOrder ) { IndexType begin = sliceOffset + threadIdx * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } else { const IndexType begin = sliceOffset + threadIdx; // threadIdx.x = chunkIdx within the slice const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } //__syncthreads(); reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } // TODO: finish details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { if( threadIdx == 0 ) sliceInfo = this->slices[ sliceIdx ]; if( threadIdx < sliceInfo.size ) __syncthreads(); //} //details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ //if( threadIdx == 0 ) // sliceInfo = this->slices[ sliceIdx ]; if( threadIdx.x < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx; const IndexType row = sliceInfo.firstSegment + threadIdx.x; IndexType chunkIndex( 0 ); if( threadIdx != 0 ) if( threadIdx.x != 0 ) chunkIndex = this->rowToChunkMapping[ row - 1 ]; const IndexType lastChunk = this->rowToChunkMapping[ row ]; RealType result( zero ); Loading @@ -635,11 +650,11 @@ segmentsReductionKernel( IndexType gridIdx, reduction( result, chunksResults[ chunkIndex++ ] ); keeper( row, result ); } } // threadIdx //} // threadIdx } // sliceIdx } #endif } // namespace Segments } // namespace Conatiners } // namespace Containers } // namespace TNL src/TNL/Containers/Segments/details/ChunkedEllpack.h +1 −1 Original line number Diff line number Diff line Loading @@ -97,7 +97,7 @@ class ChunkedEllpack const IndexType& sliceIndex = segmentsToSlicesMapping.getElement( segmentIdx ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices.getElement( sliceIndex ).firstSegment ) firstChunkOfSegment = segmentsToChunksMapping[ segmentIdx - 1 ]; firstChunkOfSegment = segmentsToChunksMapping.getElement( segmentIdx - 1 ); const IndexType lastChunkOfSegment = segmentsToChunksMapping.getElement( segmentIdx ); const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; Loading Loading
src/TNL/Containers/Segments/ChunkedEllpack.h +3 −0 Original line number Diff line number Diff line Loading @@ -151,6 +151,9 @@ class ChunkedEllpack ChunkedEllpackSliceInfoContainer slices; IndexType numberOfSlices; template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_ > friend class ChunkedEllpack; }; } // namespace Segements Loading
src/TNL/Containers/Segments/ChunkedEllpack.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -446,7 +446,7 @@ operator=( const ChunkedEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder this->rowToChunkMapping = source.rowToChunkMapping; this->rowToSliceMapping = source.rowToSliceMapping; this->rowPointers = source.rowPointers; this->chunksToSegmentMapping = source.chunksToSegmentsMapping; this->chunksToSegmentsMapping = source.chunksToSegmentsMapping; this->slices = source.slices; this->numberOfSlices = source.numberOfSlices; return *this; Loading
src/TNL/Containers/Segments/ChunkedEllpackView.h +23 −1 Original line number Diff line number Diff line Loading @@ -147,12 +147,13 @@ class ChunkedEllpackView protected: #ifdef HAVE_CUDA template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > //__device__ __device__ void segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Loading @@ -161,6 +162,7 @@ class ChunkedEllpackView ResultKeeper keeper, Real zero, Args... args ) const; #endif IndexType size = 0, storageSize = 0; Loading @@ -187,6 +189,26 @@ class ChunkedEllpackView ChunkedEllpackSliceInfoContainerView slices; IndexType numberOfSlices; #ifdef HAVE_CUDA template< typename View_, typename Index_, typename Fetch_, typename Reduction_, typename ResultKeeper_, typename Real_, typename... Args_ > friend __global__ void ChunkedEllpackSegmentsReductionKernel( View_ chunkedEllpack, Index_ gridIdx, Index_ first, Index_ last, Fetch_ fetch, Reduction_ reduction, ResultKeeper_ keeper, Real_ zero, Args_... args ); #endif }; } // namespace Segements } // namespace Conatiners Loading
src/TNL/Containers/Segments/ChunkedEllpackView.hpp +83 −68 Original line number Diff line number Diff line Loading @@ -38,7 +38,7 @@ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Real zero, Args... args ) { view.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif Loading Loading @@ -337,20 +337,29 @@ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { if( std::is_same< DeviceType, Devices::Host >::value ) { for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ ) IndexType firstSliceIdx( 0 ), lastSliceIdx( numberOfSlices ); if( first != 0 || last != this->size ) { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->size, "" ); firstSliceIdx = rowToSliceMapping.getElement( first ); lastSliceIdx = rowToSliceMapping.getElement( last - 1 ) + 1; } const IndexType chunksInSlice = this->chunksInSlice; auto rowToChunkMapping = this->rowToChunkMapping; auto rowToSliceMapping = this->rowToSliceMapping; auto slices = this->slices; auto work = [=] __cuda_callable__ ( IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = rowToSliceMapping[ segmentIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) if( segmentIdx != slices[ sliceIdx ].firstSegment ) { firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; } const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; const IndexType sliceOffset = slices[ sliceIdx ].pointer; const IndexType chunkSize = slices[ sliceIdx ].chunkSize; const IndexType segmentSize = segmentChunksCount * chunkSize; bool compute( true ); Loading @@ -364,17 +373,19 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const } else { IndexType localIdx( 0 ); for( IndexType chunkIdx = 0; chunkIdx < segmentChunksCount; chunkIdx++ ) { IndexType begin = sliceOffset + firstChunkOfSegment + chunkIdx; IndexType end = begin + chunksInSlice * chunkSize; IndexType localIdx( 0 ); for( IndexType j = begin; j < end && compute; j += chunksInSlice ) { f( segmentIdx, localIdx++, j, compute, args...); } } } } }; Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... ); } template< typename Device, Loading @@ -399,8 +410,8 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( std::is_same< DeviceType, Devices::Host >::value ) { segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... ); return; //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... ); //return; for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ ) { Loading Loading @@ -442,7 +453,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red } if( std::is_same< DeviceType, Devices::Cuda >::value ) { /*const IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; #ifdef HAVE_CUDA //const IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; // TODO: This ignores parameters first and last const IndexType cudaBlocks = this->numberOfSlices; const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); dim3 cudaBlockSize( this->chunksInSlice ), cudaGridSize; Loading @@ -452,10 +465,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); }*/ } #endif } } Loading Loading @@ -550,6 +564,7 @@ printStructure( std::ostream& str ) const << " chunk = " << this->rowToChunkMapping.getElement( i ) << std::endl; } #ifdef HAVE_CUDA template< typename Device, typename Index, bool RowMajorOrder > Loading @@ -558,7 +573,7 @@ template< typename Device, typename ResultKeeper, typename Real, typename... Args > //__cuda_callable__ __device__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, Loading @@ -574,60 +589,60 @@ segmentsReductionKernel( IndexType gridIdx, const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice; const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice; for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) //for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) { //const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; //if( sliceIdx >= lastSlice ) // return; const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx >= lastSlice ) return; RealType chunksResults[ 256 ]; //Cuda::getSharedMemory< RealType >(); for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx == 0 ) RealType* chunksResults = Cuda::getSharedMemory< RealType >(); //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx.x == 0 ) sliceInfo = this->slices[ sliceIdx ]; chunksResults[ threadIdx ] = zero; //__syncthreads(); chunksResults[ threadIdx.x ] = zero; __syncthreads(); const IndexType sliceOffset = sliceInfo.pointer; const IndexType chunkSize = sliceInfo.chunkSize; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x; const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != sliceInfo.firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; IndexType localIdx = ( threadIdx - firstChunkOfSegment ) * chunkSize; IndexType localIdx = ( threadIdx.x - firstChunkOfSegment ) * chunkSize; bool compute( true ); if( RowMajorOrder ) { IndexType begin = sliceOffset + threadIdx * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } else { const IndexType begin = sliceOffset + threadIdx; // threadIdx.x = chunkIdx within the slice const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } //__syncthreads(); reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } // TODO: finish details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { if( threadIdx == 0 ) sliceInfo = this->slices[ sliceIdx ]; if( threadIdx < sliceInfo.size ) __syncthreads(); //} //details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ //if( threadIdx == 0 ) // sliceInfo = this->slices[ sliceIdx ]; if( threadIdx.x < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx; const IndexType row = sliceInfo.firstSegment + threadIdx.x; IndexType chunkIndex( 0 ); if( threadIdx != 0 ) if( threadIdx.x != 0 ) chunkIndex = this->rowToChunkMapping[ row - 1 ]; const IndexType lastChunk = this->rowToChunkMapping[ row ]; RealType result( zero ); Loading @@ -635,11 +650,11 @@ segmentsReductionKernel( IndexType gridIdx, reduction( result, chunksResults[ chunkIndex++ ] ); keeper( row, result ); } } // threadIdx //} // threadIdx } // sliceIdx } #endif } // namespace Segments } // namespace Conatiners } // namespace Containers } // namespace TNL
src/TNL/Containers/Segments/details/ChunkedEllpack.h +1 −1 Original line number Diff line number Diff line Loading @@ -97,7 +97,7 @@ class ChunkedEllpack const IndexType& sliceIndex = segmentsToSlicesMapping.getElement( segmentIdx ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices.getElement( sliceIndex ).firstSegment ) firstChunkOfSegment = segmentsToChunksMapping[ segmentIdx - 1 ]; firstChunkOfSegment = segmentsToChunksMapping.getElement( segmentIdx - 1 ); const IndexType lastChunkOfSegment = segmentsToChunksMapping.getElement( segmentIdx ); const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; Loading