diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h
index b8066f63538088e692ac322a4f1dc85920ca717e..1a14db3384e3f74b9e20df3ed3f3f1fe83a07eb1 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.h
+++ b/src/TNL/Algorithms/Segments/EllpackView.h
@@ -22,6 +22,8 @@ namespace TNL {
 namespace Algorithms {
 namespace Segments {
 
+enum EllpackKernelType { Scalar, Vector, Vector2, Vector4, Vector8, Vector16 };
+
 template< typename Device,
           typename Index,
           ElementsOrganization Organization = Segments::DefaultElementsOrganization< Device >::getOrganization(),
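The new EllpackKernelType enum is not referenced anywhere else in this patch; it presumably prepares the ground for selecting among reduction-kernel variants that differ in how many threads cooperate on one segment. The helper below is purely hypothetical (not part of TNL or of this patch) and shows one plausible reading of the enum values, assuming Scalar means one thread per segment and the Vector variants mean progressively larger thread groups up to a full warp:

// The enum as declared in EllpackView.h above.
enum EllpackKernelType { Scalar, Vector, Vector2, Vector4, Vector8, Vector16 };

// Hypothetical helper, NOT part of the patch: a plausible mapping from
// kernel type to the number of threads cooperating on a single segment.
constexpr int threadsPerSegment( EllpackKernelType type )
{
   switch( type ) {
      case Scalar:   return 1;   // one thread reduces a whole segment
      case Vector2:  return 2;   // sub-warp groups for short segments
      case Vector4:  return 4;
      case Vector8:  return 8;
      case Vector16: return 16;
      case Vector:   return 32;  // a full warp per segment
   }
   return 32;
}

Only the full-warp variant is implemented by the kernels in this patch; the other values would matter for matrices whose rows are much shorter than a warp.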
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 7abf2caed6f0218f6cee1715a163c5bbebe140f7..6f49c55eef3bf277722b46c728b56c68b3a4f4dc 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -19,6 +19,124 @@ namespace TNL {
 namespace Algorithms {
 namespace Segments {
 
+#ifdef HAVE_CUDA
+template< typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real >
+__global__ void
+EllpackCudaReductionKernelFull( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize )
+{
+   const int warpSize = 32;
+   const int gridID = 0;
+   const Index segmentIdx = first + ( ( gridID * TNL::Cuda::getMaxGridXSize() ) + ( blockIdx.x * blockDim.x ) + threadIdx.x ) / warpSize;
+   if( segmentIdx >= last )
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31;  // & is cheaper than %
+   const Index begin = segmentIdx * segmentSize;
+   const Index end = begin + segmentSize;
+
+   /* Calculate result */
+   bool compute( true );
+   for( Index i = begin + laneID; i < end; i += warpSize )
+      result = reduction( result, fetch( segmentIdx, i - begin, i, compute ) );  // i - begin is the in-segment index under warp-strided access
+
+   /* Reduction */
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( laneID == 0 )
+      keep( segmentIdx, result );
+}
+
+template< typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real >
+__global__ void
+EllpackCudaReductionKernelCompact( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize )
+{
+   const int warpSize = 32;
+   const int gridID = 0;
+   const Index segmentIdx = first + ( ( gridID * TNL::Cuda::getMaxGridXSize() ) + ( blockIdx.x * blockDim.x ) + threadIdx.x ) / warpSize;
+   if( segmentIdx >= last )
+      return;
+
+   Real result = zero;
+   const Index laneID = threadIdx.x & 31;  // & is cheaper than %
+   const Index begin = segmentIdx * segmentSize;
+   const Index end = begin + segmentSize;
+
+   /* Calculate result */
+   bool compute( true );
+   for( Index i = begin + laneID; i < end; i += warpSize )
+      result = reduction( result, fetch( i, compute ) );
+
+   /* Reduction */
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+   result = reduction( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+   /* Write result */
+   if( laneID == 0 )
+      keep( segmentIdx, result );
+}
+#endif
+
+template< typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          bool FullFetch = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+struct EllpackCudaReductionDispatcher
+{
+   static void
+   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
+   {
+#ifdef HAVE_CUDA
+      const Index segmentsCount = last - first;
+      const Index threadsCount = segmentsCount * 32;
+      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
+      dim3 blockSize( 256 );
+      dim3 gridSize( blocksCount );
+      EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
+      cudaDeviceSynchronize();
+#endif
+   }
+};
+
+template< typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real >
+struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Real, false >
+{
+   static void
+   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
+   {
+#ifdef HAVE_CUDA
+      const Index segmentsCount = last - first;
+      const Index threadsCount = segmentsCount * 32;
+      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
+      dim3 blockSize( 256 );
+      dim3 gridSize( blocksCount );
+      EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
+      cudaDeviceSynchronize();
+#endif
+   }
+};
+
 template< typename Device,
           typename Index,
@@ -277,18 +395,23 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
    using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( Organization == RowMajorOrder ) {
-      const IndexType segmentSize = this->segmentSize;
-      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
-         const IndexType begin = segmentIdx * segmentSize;
-         const IndexType end = begin + segmentSize;
-         RealType aux( zero );
-         IndexType localIdx( 0 );
-         bool compute( true );
-         for( IndexType j = begin; j < end && compute; j++ )
-            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
-         keeper( segmentIdx, aux );
-      };
-      Algorithms::ParallelFor< Device >::exec( first, last, l );
+      if( std::is_same< Device, Devices::Cuda >::value )
+         EllpackCudaReductionDispatcher< IndexType, Fetch, Reduction, ResultKeeper, Real >::exec( first, last, fetch, reduction, keeper, zero, segmentSize );
+      else
+      {
+         const IndexType segmentSize = this->segmentSize;
+         auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable {
+            const IndexType begin = segmentIdx * segmentSize;
+            const IndexType end = begin + segmentSize;
+            RealType aux( zero );
+            IndexType localIdx( 0 );
+            bool compute( true );
+            for( IndexType j = begin; j < end && compute; j++ )
+               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            keeper( segmentIdx, aux );
+         };
+         Algorithms::ParallelFor< Device >::exec( first, last, l );
+      }
    }
    else {
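Both kernels above implement the same warp-per-segment pattern: each segment gets one 32-thread warp, the lanes stride through the segment's elements 32 apart, and five __shfl_down_sync steps fold the 32 per-lane partials into lane 0, which writes the result. The two kernels differ only in fetch arity: the Full variant forwards segment and local indices to the fetch lambda, while the Compact variant passes just the global index, and details::CheckFetchLambda selects between them at compile time through the dispatcher's FullFetch flag. Note that, unlike the sequential fallback kept for other devices, the warp kernels do not stop early when the fetch lambda clears compute. A minimal standalone sketch of the shuffle tree, written for a plain sum over one warp (illustration only, not TNL code):

// One warp sums 32 floats. After each step, the lanes below `offset`
// hold partial sums over 2*offset inputs.
__global__ void warpSum( const float* in, float* out )
{
   float v = in[ threadIdx.x ];
   for( int offset = 16; offset > 0; offset /= 2 )
      v += __shfl_down_sync( 0xFFFFFFFF, v, offset );  // pull value from lane + offset
   if( threadIdx.x == 0 )
      *out = v;  // lane 0 ends up with the full sum
}

// Usage: warpSum<<< 1, 32 >>>( dIn, dOut );

The generic kernels cannot unroll this as a loop over a templated reduction functor quite so tersely, which is why they spell out the five shuffle steps explicitly.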
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
index ef56ec63a46561dd2dffc723c780076ce7d671a6..b13a19c6a4e3eafa51d401091a267aea2cb18cd3 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h
@@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types
     TNL::Matrices::SparseMatrix< int,    TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< long,   TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< float,  TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
-    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,    TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,   TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,  TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,    TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,   TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,  TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >
 #endif
 >;
diff --git a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
index abb4213ca2efb4912c8dab33bbec4d7d3d006665..c93aace755ec0e960ac080a9fb0a7c3323adc819 100644
--- a/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
+++ b/src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h
@@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types
     TNL::Matrices::SparseMatrix< int,    TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< long,   TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
     TNL::Matrices::SparseMatrix< float,  TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
-    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >
+    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,    TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,   TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,  TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int,  TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< int,    TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< long,   TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< float,  TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >,
+    TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >
 #endif
 >;
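The new rows extend the CUDA-typed tests from column-major-only coverage to row-major Ellpack, so the warp-per-segment reduction path added in EllpackView.hpp is actually exercised by both the generic sparse-matrix tests and the vector-product tests. The RowMajorEllpack alias itself is defined earlier in these test headers and is not shown in the hunks; a plausible shape, reconstructed by analogy with the ColumnMajorEllpack naming (hypothetical, not copied from the files):

// Hypothetical sketch, NOT taken from the test headers: an alias of this
// shape pins the Ellpack segments format to row-major element organization,
// matching the three-parameter segments slot of TNL::Matrices::SparseMatrix.
template< typename Device, typename Index, typename IndexAllocator >
using RowMajorEllpack = TNL::Algorithms::Segments::
   Ellpack< Device, Index, IndexAllocator, TNL::Algorithms::Segments::RowMajorOrder >;

Each element type (int, long, float, double) is paired with both int and long index types, mirroring the existing column-major list one for one.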