Loading src/TNL/Algorithms/Segments/CSR.h +5 −1 Original line number Diff line number Diff line Loading @@ -546,6 +546,11 @@ template< typename Device, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > Loading @@ -556,7 +561,6 @@ template< typename Device, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRDefault = CSRScalar< Device, Index, IndexAllocator >; } // namespace Segments } // namespace Algorithms } // namespace TNL Loading src/TNL/Algorithms/Segments/CSRView.h +7 −1 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ #include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h> #include <TNL/Algorithms/Segments/SegmentsPrinting.h> Loading Loading @@ -163,9 +164,14 @@ template< typename Device, typename Index > using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >; template< typename Device, typename Index, int ThreadsInBlock = 256 > using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device, ThreadsInBlock > >; template< typename Device, typename Index > using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >; using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >; template< typename Device, typename Index > Loading src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h 0 → 100644 +64 −0 Original line number Diff line number Diff line /*************************************************************************** CSRLightKernel.h - description ------------------- begin : Jun 9, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRLightKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRLightKernel< Index, Device >; using ConstViewType = CSRLightKernel< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ); void reset(); ViewType getView(); ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > void reduceSegments( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const; protected: int threadsPerSegment = 0; }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp> src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp 0 → 100644 +422 −0 Original line number Diff line number Diff line /*************************************************************************** CSRLightKernel.hpp - description ------------------- begin : Jun 9, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic2( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) { const Index segmentIdx = first + ( ( gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 2; if( segmentIdx >= last ) return; const Index inGroupID = threadIdx.x & 1; // & is cheaper than % const Index maxID = offsets[ segmentIdx + 1]; Real result = zero; for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic4( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID ) { const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4; if (segmentIdx >= last) return; const Index inGroupID = threadIdx.x & 3; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic8( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) { const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8; if (segmentIdx >= last) return; Index i; const Index inGroupID = threadIdx.x & 7; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic16( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID ) { const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 16; if( segmentIdx >= last ) return; Index i; const Index inGroupID = threadIdx.x & 15; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 ) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRVector( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID ) { const int warpSize = 32; const Index warpID = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; if (warpID >= last) return; Real result = zero; const Index laneID = threadIdx.x & 31; // & is cheaper than % Index endID = offsets[warpID + 1]; /* Calculate result */ for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize) result = reduce( result, fetch( i, true ) ); /* Reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( laneID == 0 ) keep( warpID, result ); } #endif template< typename Index, typename Device, typename Fetch, typename Reduce, typename Keep, bool DispatchScalarCSR = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || std::is_same< Device, Devices::Host >::value > struct CSRLightKernelreduceSegmentsDispatcher; template< typename Index, typename Device, typename Fetch, typename Reduction, typename ResultKeeper > struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true > { template< typename Offsets, typename Real > static void reduce( const Offsets& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduce, ResultKeeper& keep, const Real& zero, const Index threadsPerSegment ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: reduceSegments( offsets, first, last, fetch, reduce, keep, zero ); } }; template< typename Index, typename Device, typename Fetch, typename Reduce, typename Keep > struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false > { template< typename OffsetsView, typename Real > static void reduce( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduce& reduce, Keep& keep, const Real& zero, const Index threadsPerSegment ) { #ifdef HAVE_CUDA const int threads = 128; Index blocks, groupSize; //if (KernelType == CSRLightWithoutAtomic) int neededThreads = threadsPerSegment * ( last - first ); //else // neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment); /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; } else { blocks = TNL::Cuda::getMaxGridXSize(); neededThreads -= TNL::Cuda::getMaxGridXSize() * threads; } if (threadsPerSegment == 2) SpMVCSRLightWithoutAtomic2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else if (threadsPerSegment == 4) SpMVCSRLightWithoutAtomic4<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else if (threadsPerSegment == 8) SpMVCSRLightWithoutAtomic8<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else if (threadsPerSegment == 16) SpMVCSRLightWithoutAtomic16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else // if (threadsPerSegment == 32) { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); } /*else { // Execute CSR MultiVector SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>( inVector, outVector, matrix.getoffsets().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), rows, threadsPerSegment / 32, grid ); }*/ } #endif } }; template< typename Index, typename Device > template< typename Offsets > void CSRLightKernel< Index, Device >:: init( const Offsets& offsets ) { //const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); //this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() ); const Index segmentsCount = offsets.getSize() - 1; //const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size size_t neededThreads = segmentsCount * 32;//warpSize; Index blocks, threadsPerSegment; const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) threadsPerSegment = 2; else if( elementsInSegment <= 4 ) threadsPerSegment = 4; else if( elementsInSegment <= 8 ) threadsPerSegment = 8; else if( elementsInSegment <= 16 ) threadsPerSegment = 16; else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) threadsPerSegment = 32; // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 33, "" ); } template< typename Index, typename Device > void CSRLightKernel< Index, Device >:: reset() { this->threadsPerSegment = 0; } template< typename Index, typename Device > auto CSRLightKernel< Index, Device >:: getView() -> ViewType { return *this; } template< typename Index, typename Device > TNL::String CSRLightKernel< Index, Device >:: getKernelType() { return "Light"; } template< typename Index, typename Device > auto CSRLightKernel< Index, Device >:: getConstView() const -> ConstViewType { return *this; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduce, typename Keep, typename Real > void CSRLightKernel< Index, Device >:: reduceSegments( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduce& reduce, Keep& keep, const Real& zero ) const { TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); TNL_ASSERT_LE( this->threadsPerSegment, 33, "" ); CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep >::reduce( offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment ); } } // namespace Segments } // namespace Algorithms } // namespace TNL src/UnitTests/Matrices/CMakeLists.txt +2 −0 Original line number Diff line number Diff line Loading @@ -8,6 +8,7 @@ set( COMMON_TESTS SparseMatrixTest_CSRScalar SparseMatrixTest_CSRVector SparseMatrixTest_CSRHybrid SparseMatrixTest_CSRLight SparseMatrixTest_CSRAdaptive SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack Loading @@ -16,6 +17,7 @@ set( COMMON_TESTS SparseMatrixVectorProductTest_CSRScalar SparseMatrixVectorProductTest_CSRVector SparseMatrixVectorProductTest_CSRHybrid SparseMatrixVectorProductTest_CSRLight SparseMatrixVectorProductTest_CSRAdaptive SparseMatrixVectorProductTest_Ellpack SparseMatrixVectorProductTest_SlicedEllpack Loading Loading
src/TNL/Algorithms/Segments/CSR.h +5 −1 Original line number Diff line number Diff line Loading @@ -546,6 +546,11 @@ template< typename Device, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > Loading @@ -556,7 +561,6 @@ template< typename Device, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRDefault = CSRScalar< Device, Index, IndexAllocator >; } // namespace Segments } // namespace Algorithms } // namespace TNL Loading
src/TNL/Algorithms/Segments/CSRView.h +7 −1 Original line number Diff line number Diff line Loading @@ -17,6 +17,7 @@ #include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h> #include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h> #include <TNL/Algorithms/Segments/SegmentsPrinting.h> Loading Loading @@ -163,9 +164,14 @@ template< typename Device, typename Index > using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >; template< typename Device, typename Index, int ThreadsInBlock = 256 > using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device, ThreadsInBlock > >; template< typename Device, typename Index > using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >; using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >; template< typename Device, typename Index > Loading
src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h 0 → 100644 +64 −0 Original line number Diff line number Diff line /*************************************************************************** CSRLightKernel.h - description ------------------- begin : Jun 9, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRLightKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRLightKernel< Index, Device >; using ConstViewType = CSRLightKernel< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ); void reset(); ViewType getView(); ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real > void reduceSegments( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero ) const; protected: int threadsPerSegment = 0; }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp>
src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp 0 → 100644 +422 −0 Original line number Diff line number Diff line /*************************************************************************** CSRLightKernel.hpp - description ------------------- begin : Jun 9, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic2( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) { const Index segmentIdx = first + ( ( gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 2; if( segmentIdx >= last ) return; const Index inGroupID = threadIdx.x & 1; // & is cheaper than % const Index maxID = offsets[ segmentIdx + 1]; Real result = zero; for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic4( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID ) { const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4; if (segmentIdx >= last) return; const Index inGroupID = threadIdx.x & 3; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic8( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID) { const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8; if (segmentIdx >= last) return; Index i; const Index inGroupID = threadIdx.x & 7; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRLightWithoutAtomic16( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID ) { const Index segmentIdx = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 16; if( segmentIdx >= last ) return; Index i; const Index inGroupID = threadIdx.x & 15; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 ) result = reduce( result, fetch( i, true ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); /* Write result */ if( inGroupID == 0 ) keep( segmentIdx, result ); } template< typename Real, typename Index, typename OffsetsView, typename Fetch, typename Reduce, typename Keep > __global__ void SpMVCSRVector( OffsetsView offsets, const Index first, const Index last, Fetch fetch, Reduce reduce, Keep keep, const Real zero, const Index gridID ) { const int warpSize = 32; const Index warpID = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; if (warpID >= last) return; Real result = zero; const Index laneID = threadIdx.x & 31; // & is cheaper than % Index endID = offsets[warpID + 1]; /* Calculate result */ for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize) result = reduce( result, fetch( i, true ) ); /* Reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); /* Write result */ if( laneID == 0 ) keep( warpID, result ); } #endif template< typename Index, typename Device, typename Fetch, typename Reduce, typename Keep, bool DispatchScalarCSR = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || std::is_same< Device, Devices::Host >::value > struct CSRLightKernelreduceSegmentsDispatcher; template< typename Index, typename Device, typename Fetch, typename Reduction, typename ResultKeeper > struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true > { template< typename Offsets, typename Real > static void reduce( const Offsets& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduce, ResultKeeper& keep, const Real& zero, const Index threadsPerSegment ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: reduceSegments( offsets, first, last, fetch, reduce, keep, zero ); } }; template< typename Index, typename Device, typename Fetch, typename Reduce, typename Keep > struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false > { template< typename OffsetsView, typename Real > static void reduce( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduce& reduce, Keep& keep, const Real& zero, const Index threadsPerSegment ) { #ifdef HAVE_CUDA const int threads = 128; Index blocks, groupSize; //if (KernelType == CSRLightWithoutAtomic) int neededThreads = threadsPerSegment * ( last - first ); //else // neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment); /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads) { blocks = roundUpDivision(neededThreads, threads); neededThreads = 0; } else { blocks = TNL::Cuda::getMaxGridXSize(); neededThreads -= TNL::Cuda::getMaxGridXSize() * threads; } if (threadsPerSegment == 2) SpMVCSRLightWithoutAtomic2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else if (threadsPerSegment == 4) SpMVCSRLightWithoutAtomic4<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else if (threadsPerSegment == 8) SpMVCSRLightWithoutAtomic8<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else if (threadsPerSegment == 16) SpMVCSRLightWithoutAtomic16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); else // if (threadsPerSegment == 32) { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); } /*else { // Execute CSR MultiVector SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>( inVector, outVector, matrix.getoffsets().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), rows, threadsPerSegment / 32, grid ); }*/ } #endif } }; template< typename Index, typename Device > template< typename Offsets > void CSRLightKernel< Index, Device >:: init( const Offsets& offsets ) { //const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); //this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() ); const Index segmentsCount = offsets.getSize() - 1; //const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size size_t neededThreads = segmentsCount * 32;//warpSize; Index blocks, threadsPerSegment; const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) threadsPerSegment = 2; else if( elementsInSegment <= 4 ) threadsPerSegment = 4; else if( elementsInSegment <= 8 ) threadsPerSegment = 8; else if( elementsInSegment <= 16 ) threadsPerSegment = 16; else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) threadsPerSegment = 32; // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 33, "" ); } template< typename Index, typename Device > void CSRLightKernel< Index, Device >:: reset() { this->threadsPerSegment = 0; } template< typename Index, typename Device > auto CSRLightKernel< Index, Device >:: getView() -> ViewType { return *this; } template< typename Index, typename Device > TNL::String CSRLightKernel< Index, Device >:: getKernelType() { return "Light"; } template< typename Index, typename Device > auto CSRLightKernel< Index, Device >:: getConstView() const -> ConstViewType { return *this; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduce, typename Keep, typename Real > void CSRLightKernel< Index, Device >:: reduceSegments( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduce& reduce, Keep& keep, const Real& zero ) const { TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); TNL_ASSERT_LE( this->threadsPerSegment, 33, "" ); CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep >::reduce( offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment ); } } // namespace Segments } // namespace Algorithms } // namespace TNL
src/UnitTests/Matrices/CMakeLists.txt +2 −0 Original line number Diff line number Diff line Loading @@ -8,6 +8,7 @@ set( COMMON_TESTS SparseMatrixTest_CSRScalar SparseMatrixTest_CSRVector SparseMatrixTest_CSRHybrid SparseMatrixTest_CSRLight SparseMatrixTest_CSRAdaptive SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack Loading @@ -16,6 +17,7 @@ set( COMMON_TESTS SparseMatrixVectorProductTest_CSRScalar SparseMatrixVectorProductTest_CSRVector SparseMatrixVectorProductTest_CSRHybrid SparseMatrixVectorProductTest_CSRLight SparseMatrixVectorProductTest_CSRAdaptive SparseMatrixVectorProductTest_Ellpack SparseMatrixVectorProductTest_SlicedEllpack Loading