Loading src/TNL/Algorithms/Segments/EllpackView.h +2 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,8 @@ namespace TNL { namespace Algorithms { namespace Segments { enum EllpackKernelType { Scalar, Vector, Vector2, Vector4, Vector8, Vector16 }; template< typename Device, typename Index, ElementsOrganization Organization = Segments::DefaultElementsOrganization< Device >::getOrganization(), Loading src/TNL/Algorithms/Segments/EllpackView.hpp +135 −12 Original line number Diff line number Diff line Loading @@ -19,6 +19,124 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > __global__ void EllpackCudaReductionKernelFull( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize ) { const int warpSize = 32; const int gridID = 0; const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; if (segmentIdx >= last) return; Real result = zero; const Index laneID = threadIdx.x & 31; // & is cheaper than % const Index begin = segmentIdx * segmentSize; const Index end = begin + segmentSize; /* Calculate result */ Index localIdx( 0 ); bool compute( true ); for( Index i = begin + laneID; i < end; i += warpSize) result = reduction( result, fetch( segmentIdx, localIdx++, i, compute ) ); /* Reduction */ result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 8 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( laneID == 0 ) keep( segmentIdx, result ); } template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > __global__ void EllpackCudaReductionKernelCompact( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize ) { const int warpSize = 32; const int gridID = 0; const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; if (segmentIdx >= last) return; Real result = zero; const Index laneID = threadIdx.x & 31; // & is cheaper than % const Index begin = segmentIdx * segmentSize; const Index end = begin + segmentSize; /* Calculate result */ bool compute( true ); for( Index i = begin + laneID; i < end; i += warpSize) result = reduction( result, fetch( i, compute ) ); /* Reduction */ result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 8 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( laneID == 0 ) keep( segmentIdx, result ); } #endif template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, bool FullFetch = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() > struct EllpackCudaReductionDispatcher { static void exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); #endif } }; template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Real, false > { static void exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); #endif } }; template< typename Device, typename Index, Loading Loading @@ -276,6 +394,10 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) ); using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; if( Organization == RowMajorOrder ) { if( std::is_same< Device, Devices::Cuda >::value ) EllpackCudaReductionDispatcher< IndexType, Fetch, Reduction, ResultKeeper, Real>::exec( first, last, fetch, reduction, keeper, zero, segmentSize ); else { const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable { Loading @@ -285,11 +407,12 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& IndexType localIdx( 0 ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) ); aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l ); } } else { const IndexType storageSize = this->getStorageSize(); Loading src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h +9 −1 Original line number Diff line number Diff line Loading @@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack > TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack > #endif >; Loading src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h +9 −1 Original line number Diff line number Diff line Loading @@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack > TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack > #endif >; Loading Loading
src/TNL/Algorithms/Segments/EllpackView.h +2 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,8 @@ namespace TNL { namespace Algorithms { namespace Segments { enum EllpackKernelType { Scalar, Vector, Vector2, Vector4, Vector8, Vector16 }; template< typename Device, typename Index, ElementsOrganization Organization = Segments::DefaultElementsOrganization< Device >::getOrganization(), Loading
src/TNL/Algorithms/Segments/EllpackView.hpp +135 −12 Original line number Diff line number Diff line Loading @@ -19,6 +19,124 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > __global__ void EllpackCudaReductionKernelFull( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize ) { const int warpSize = 32; const int gridID = 0; const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; if (segmentIdx >= last) return; Real result = zero; const Index laneID = threadIdx.x & 31; // & is cheaper than % const Index begin = segmentIdx * segmentSize; const Index end = begin + segmentSize; /* Calculate result */ Index localIdx( 0 ); bool compute( true ); for( Index i = begin + laneID; i < end; i += warpSize) result = reduction( result, fetch( segmentIdx, localIdx++, i, compute ) ); /* Reduction */ result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 8 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( laneID == 0 ) keep( segmentIdx, result ); } template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > __global__ void EllpackCudaReductionKernelCompact( Index first, Index last, Fetch fetch, const Reduction reduction, ResultKeeper keep, const Real zero, Index segmentSize ) { const int warpSize = 32; const int gridID = 0; const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; if (segmentIdx >= last) return; Real result = zero; const Index laneID = threadIdx.x & 31; // & is cheaper than % const Index begin = segmentIdx * segmentSize; const Index end = begin + segmentSize; /* Calculate result */ bool compute( true ); for( Index i = begin + laneID; i < end; i += warpSize) result = reduction( result, fetch( i, compute ) ); /* Reduction */ result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 8 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduction( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( laneID == 0 ) keep( segmentIdx, result ); } #endif template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, bool FullFetch = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() > struct EllpackCudaReductionDispatcher { static void exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); #endif } }; template< typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Real, false > { static void exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); cudaDeviceSynchronize(); #endif } }; template< typename Device, typename Index, Loading Loading @@ -276,6 +394,10 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >() ) ); using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; if( Organization == RowMajorOrder ) { if( std::is_same< Device, Devices::Cuda >::value ) EllpackCudaReductionDispatcher< IndexType, Fetch, Reduction, ResultKeeper, Real>::exec( first, last, fetch, reduction, keeper, zero, segmentSize ); else { const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType segmentIdx ) mutable { Loading @@ -285,11 +407,12 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& IndexType localIdx( 0 ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) ); aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l ); } } else { const IndexType storageSize = this->getStorageSize(); Loading
src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h +9 −1 Original line number Diff line number Diff line Loading @@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack > TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack > #endif >; Loading
src/UnitTests/Matrices/SparseMatrixVectorProductTest_Ellpack.h +9 −1 Original line number Diff line number Diff line Loading @@ -46,7 +46,15 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack > TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, ColumnMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack >, TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, RowMajorEllpack > #endif >; Loading