Loading src/TNL/Containers/Segments/ChunkedEllpackView.h +18 −0 Original line number Diff line number Diff line Loading @@ -148,6 +148,21 @@ class ChunkedEllpackView protected: #ifdef HAVE_CUDA template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ void segmentsReductionKernelWithAllParameters( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const; template< typename Fetch, typename Reduction, typename ResultKeeper, Loading Loading @@ -206,6 +221,9 @@ class ChunkedEllpackView ResultKeeper_ keeper, Real_ zero, Args_... args ); template< typename Index_, typename Fetch_, bool B_ > friend struct details::ChunkedEllpackSegmentsReductionDispatcher; #endif }; } // namespace Segements Loading src/TNL/Containers/Segments/ChunkedEllpackView.hpp +130 −92 Original line number Diff line number Diff line Loading @@ -20,30 +20,6 @@ namespace TNL { namespace Containers { namespace Segments { #ifdef HAVE_CUDA template< typename View, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif template< typename Device, typename Index, bool RowMajorOrder > Loading Loading @@ -460,9 +436,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); //ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif } Loading Loading @@ -571,7 +547,7 @@ template< typename Device, __device__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, segmentsReductionKernelWithAllParameters( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Loading @@ -582,17 +558,14 @@ segmentsReductionKernel( IndexType gridIdx, { using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice; const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice; //for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) { const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx >= lastSlice ) const IndexType firstSlice = rowToSliceMapping[ first ]; const IndexType lastSlice = rowToSliceMapping[ last - 1 ]; const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx > lastSlice ) return; RealType* chunksResults = Cuda::getSharedMemory< RealType >(); //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx.x == 0 ) sliceInfo = this->slices[ sliceIdx ]; Loading @@ -616,23 +589,89 @@ segmentsReductionKernel( IndexType gridIdx, IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) ); } else { const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) ); } __syncthreads(); if( threadIdx.x < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx.x; IndexType chunkIndex( 0 ); if( threadIdx.x != 0 ) chunkIndex = this->rowToChunkMapping[ row - 1 ]; const IndexType lastChunk = this->rowToChunkMapping[ row ]; RealType result( zero ); while( chunkIndex < lastChunk ) reduction( result, chunksResults[ chunkIndex++ ] ); if( row >= first && row < last ) keeper( row, result ); } } template< typename Device, typename Index, bool RowMajorOrder > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) ); const IndexType firstSlice = rowToSliceMapping[ first ]; const IndexType lastSlice = rowToSliceMapping[ last - 1 ]; const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx > lastSlice ) return; RealType* chunksResults = Cuda::getSharedMemory< RealType >(); __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx.x == 0 ) sliceInfo = this->slices[ sliceIdx ]; chunksResults[ threadIdx.x ] = zero; __syncthreads(); const IndexType sliceOffset = sliceInfo.pointer; const IndexType chunkSize = sliceInfo.chunkSize; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x; bool compute( true ); if( RowMajorOrder ) { IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) ); } else { const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) ); } __syncthreads(); //} //details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ //if( threadIdx == 0 ) // sliceInfo = this->slices[ sliceIdx ]; if( threadIdx.x < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx.x; Loading @@ -643,10 +682,9 @@ segmentsReductionKernel( IndexType gridIdx, RealType result( zero ); while( chunkIndex < lastChunk ) reduction( result, chunksResults[ chunkIndex++ ] ); if( row >= first && row < last ) keeper( row, result ); } //} // threadIdx } // sliceIdx } #endif Loading src/TNL/Containers/Segments/details/CheckLambdas.h +7 −48 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ namespace TNL { template< typename Index, typename Lambda > class CheckFetchLambdaAcceptsSegmentIdxAndCompute class CheckFetchLambda { private: typedef char YesType[1]; Loading @@ -27,52 +27,11 @@ class CheckFetchLambdaAcceptsSegmentIdxAndCompute template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ) ) ); template< typename C > static NoType& test(...); public: static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) ); }; template< typename Index, typename Lambda > class CheckFetchLambdaAcceptsSegmentIdx { private: typedef char YesType[1]; typedef char NoType[2]; template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index() ) ) ); template< typename C > static NoType& test(...); public: static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) ); }; template< typename Index, typename Lambda > class CheckFetchLambdaAcceptsCompute { private: typedef char YesType[1]; typedef char NoType[2]; template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), std::declval< bool& >() ) ) ); template< typename C > static NoType& test(...); public: static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) ); }; template< typename Index, typename Lambda > class CheckFetchLambda { static constexpr bool AcceptsSegmentIdxAndCompute = CheckFetchLambdaAcceptsSegmentIdxAndCompute< Index, Lambda >::value; static constexpr bool AcceptsSegmentIdx = CheckFetchLambdaAcceptsSegmentIdx< Index, Lambda >::value; static constexpr bool AcceptsCompute = CheckFetchLambdaAcceptsCompute< Index, Lambda >::value; public: static constexpr bool acceptsSegmentIdx() { return AcceptsSegmentIdxAndCompute || AcceptsSegmentIdx; }; static constexpr bool acceptsCompute() { return AcceptsSegmentIdxAndCompute || AcceptsCompute; }; static constexpr bool hasAllParameters() { return value; }; }; } // namespace details Loading src/TNL/Containers/Segments/details/ChunkedEllpack.h +76 −0 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ #include <type_traits> #include <TNL/Containers/Vector.h> #include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h> #include <TNL/Containers/Segments/details/CheckLambdas.h> namespace TNL { namespace Containers { Loading Loading @@ -223,6 +224,81 @@ class ChunkedEllpack chunksInSlice ); } }; #ifdef HAVE_CUDA template< typename Index, typename Fetch, bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() > struct ChunkedEllpackSegmentsReductionDispatcher{}; template< typename Index, typename Fetch > struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true > { template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ static void exec( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } }; template< typename Index, typename Fetch > struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false > { template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ static void exec( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } }; template< typename View, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif } //namespace details } //namespace Segments } //namespace Containers Loading src/TNL/Containers/Segments/details/LambdaAdapter.h +3 −3 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ namespace TNL { template< typename Index, typename Lambda, bool AcceptsSegmentIdx = CheckFetchLambda< Index, Lambda >::acceptsSegmentIdx() > bool AllParameters = CheckFetchLambda< Index, Lambda >::hasAllParameters() > struct FetchLambdaAdapter { }; Loading @@ -42,10 +42,10 @@ template< typename Index, typename Lambda > struct FetchLambdaAdapter< Index, Lambda, false > { using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), std::declval< bool& >() ) ); using ReturnType = decltype( std::declval< Lambda >()( Index(), std::declval< bool& >() ) ); static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute ) { return f( localIdx, globalIdx, compute ); return f( globalIdx, compute ); } }; Loading Loading
src/TNL/Containers/Segments/ChunkedEllpackView.h +18 −0 Original line number Diff line number Diff line Loading @@ -148,6 +148,21 @@ class ChunkedEllpackView protected: #ifdef HAVE_CUDA template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ void segmentsReductionKernelWithAllParameters( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const; template< typename Fetch, typename Reduction, typename ResultKeeper, Loading Loading @@ -206,6 +221,9 @@ class ChunkedEllpackView ResultKeeper_ keeper, Real_ zero, Args_... args ); template< typename Index_, typename Fetch_, bool B_ > friend struct details::ChunkedEllpackSegmentsReductionDispatcher; #endif }; } // namespace Segements Loading
src/TNL/Containers/Segments/ChunkedEllpackView.hpp +130 −92 Original line number Diff line number Diff line Loading @@ -20,30 +20,6 @@ namespace TNL { namespace Containers { namespace Segments { #ifdef HAVE_CUDA template< typename View, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif template< typename Device, typename Index, bool RowMajorOrder > Loading Loading @@ -460,9 +436,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); //ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> // ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif } Loading Loading @@ -571,7 +547,7 @@ template< typename Device, __device__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, segmentsReductionKernelWithAllParameters( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Loading @@ -582,17 +558,14 @@ segmentsReductionKernel( IndexType gridIdx, { using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice; const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice; //for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) { const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx >= lastSlice ) const IndexType firstSlice = rowToSliceMapping[ first ]; const IndexType lastSlice = rowToSliceMapping[ last - 1 ]; const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx > lastSlice ) return; RealType* chunksResults = Cuda::getSharedMemory< RealType >(); //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx.x == 0 ) sliceInfo = this->slices[ sliceIdx ]; Loading @@ -616,23 +589,89 @@ segmentsReductionKernel( IndexType gridIdx, IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) ); } else { const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute ) ); } __syncthreads(); if( threadIdx.x < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx.x; IndexType chunkIndex( 0 ); if( threadIdx.x != 0 ) chunkIndex = this->rowToChunkMapping[ row - 1 ]; const IndexType lastChunk = this->rowToChunkMapping[ row ]; RealType result( zero ); while( chunkIndex < lastChunk ) reduction( result, chunksResults[ chunkIndex++ ] ); if( row >= first && row < last ) keeper( row, result ); } } template< typename Device, typename Index, bool RowMajorOrder > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), std::declval< bool& >(), args... ) ); const IndexType firstSlice = rowToSliceMapping[ first ]; const IndexType lastSlice = rowToSliceMapping[ last - 1 ]; const IndexType sliceIdx = firstSlice + gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx > lastSlice ) return; RealType* chunksResults = Cuda::getSharedMemory< RealType >(); __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx.x == 0 ) sliceInfo = this->slices[ sliceIdx ]; chunksResults[ threadIdx.x ] = zero; __syncthreads(); const IndexType sliceOffset = sliceInfo.pointer; const IndexType chunkSize = sliceInfo.chunkSize; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx.x; bool compute( true ); if( RowMajorOrder ) { IndexType begin = sliceOffset + threadIdx.x * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) ); } else { const IndexType begin = sliceOffset + threadIdx.x; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx.x ], fetch( segmentIdx, localIdx++, j, compute, args...) ); reduction( chunksResults[ threadIdx.x ], fetch( j, compute ) ); } __syncthreads(); //} //details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; //for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) //{ //if( threadIdx == 0 ) // sliceInfo = this->slices[ sliceIdx ]; if( threadIdx.x < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx.x; Loading @@ -643,10 +682,9 @@ segmentsReductionKernel( IndexType gridIdx, RealType result( zero ); while( chunkIndex < lastChunk ) reduction( result, chunksResults[ chunkIndex++ ] ); if( row >= first && row < last ) keeper( row, result ); } //} // threadIdx } // sliceIdx } #endif Loading
src/TNL/Containers/Segments/details/CheckLambdas.h +7 −48 Original line number Diff line number Diff line Loading @@ -18,7 +18,7 @@ namespace TNL { template< typename Index, typename Lambda > class CheckFetchLambdaAcceptsSegmentIdxAndCompute class CheckFetchLambda { private: typedef char YesType[1]; Loading @@ -27,52 +27,11 @@ class CheckFetchLambdaAcceptsSegmentIdxAndCompute template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index(), std::declval< bool& >() ) ) ); template< typename C > static NoType& test(...); public: static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) ); }; template< typename Index, typename Lambda > class CheckFetchLambdaAcceptsSegmentIdx { private: typedef char YesType[1]; typedef char NoType[2]; template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), Index() ) ) ); template< typename C > static NoType& test(...); public: static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) ); }; template< typename Index, typename Lambda > class CheckFetchLambdaAcceptsCompute { private: typedef char YesType[1]; typedef char NoType[2]; template< typename C > static YesType& test( decltype(std::declval< C >()( Index(), Index(), std::declval< bool& >() ) ) ); template< typename C > static NoType& test(...); public: static constexpr bool value = ( sizeof( test< Lambda >(0) ) == sizeof( YesType ) ); }; template< typename Index, typename Lambda > class CheckFetchLambda { static constexpr bool AcceptsSegmentIdxAndCompute = CheckFetchLambdaAcceptsSegmentIdxAndCompute< Index, Lambda >::value; static constexpr bool AcceptsSegmentIdx = CheckFetchLambdaAcceptsSegmentIdx< Index, Lambda >::value; static constexpr bool AcceptsCompute = CheckFetchLambdaAcceptsCompute< Index, Lambda >::value; public: static constexpr bool acceptsSegmentIdx() { return AcceptsSegmentIdxAndCompute || AcceptsSegmentIdx; }; static constexpr bool acceptsCompute() { return AcceptsSegmentIdxAndCompute || AcceptsCompute; }; static constexpr bool hasAllParameters() { return value; }; }; } // namespace details Loading
src/TNL/Containers/Segments/details/ChunkedEllpack.h +76 −0 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ #include <type_traits> #include <TNL/Containers/Vector.h> #include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h> #include <TNL/Containers/Segments/details/CheckLambdas.h> namespace TNL { namespace Containers { Loading Loading @@ -223,6 +224,81 @@ class ChunkedEllpack chunksInSlice ); } }; #ifdef HAVE_CUDA template< typename Index, typename Fetch, bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() > struct ChunkedEllpackSegmentsReductionDispatcher{}; template< typename Index, typename Fetch > struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, true > { template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ static void exec( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { chunkedEllpack.segmentsReductionKernelWithAllParameters( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } }; template< typename Index, typename Fetch > struct ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch, false > { template< typename View, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __device__ static void exec( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { chunkedEllpack.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } }; template< typename View, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { ChunkedEllpackSegmentsReductionDispatcher< Index, Fetch >::exec( chunkedEllpack, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif } //namespace details } //namespace Segments } //namespace Containers Loading
src/TNL/Containers/Segments/details/LambdaAdapter.h +3 −3 Original line number Diff line number Diff line Loading @@ -21,7 +21,7 @@ namespace TNL { template< typename Index, typename Lambda, bool AcceptsSegmentIdx = CheckFetchLambda< Index, Lambda >::acceptsSegmentIdx() > bool AllParameters = CheckFetchLambda< Index, Lambda >::hasAllParameters() > struct FetchLambdaAdapter { }; Loading @@ -42,10 +42,10 @@ template< typename Index, typename Lambda > struct FetchLambdaAdapter< Index, Lambda, false > { using ReturnType = decltype( std::declval< Lambda >()( Index(), Index(), std::declval< bool& >() ) ); using ReturnType = decltype( std::declval< Lambda >()( Index(), std::declval< bool& >() ) ); static ReturnType call( Lambda& f, Index segmentIdx, Index localIdx, Index globalIdx, bool& compute ) { return f( localIdx, globalIdx, compute ); return f( globalIdx, compute ); } }; Loading