Loading src/Benchmarks/SpMV/spmv.h +8 −1 Original line number Diff line number Diff line Loading @@ -76,6 +76,9 @@ using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Mat template< typename Real, typename Device, typename Index > using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >; template< typename Real, typename Device, typename Index > using SparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRLight >; template< typename Real, typename Device, typename Index > using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >; Loading Loading @@ -115,6 +118,9 @@ using SymmetricSparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, I template< typename Real, typename Device, typename Index > using SymmetricSparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRHybrid >; template< typename Real, typename Device, typename Index > using SymmetricSparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRLight >; template< typename Real, typename Device, typename Index > using SymmetricSparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRAdaptive >; Loading Loading @@ -591,6 +597,7 @@ benchmarkSpmv( BenchmarkType& benchmark, benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Light >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); Loading @@ -599,7 +606,7 @@ benchmarkSpmv( BenchmarkType& benchmark, #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Light >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); Loading src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +11 −7 Original line number Diff line number Diff line Loading @@ -47,8 +47,9 @@ void SpMVCSRLightWithoutAtomic2( OffsetsView offsets, const Index maxID = offsets[ segmentIdx + 1]; Real result = zero; bool compute = true; for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); Loading Loading @@ -83,8 +84,9 @@ void SpMVCSRLightWithoutAtomic4( OffsetsView offsets, const Index maxID = offsets[segmentIdx + 1]; Real result = zero; bool compute = true; for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); Loading Loading @@ -122,8 +124,9 @@ void SpMVCSRLightWithoutAtomic8( OffsetsView offsets, const Index maxID = offsets[segmentIdx + 1]; Real result = zero; bool compute = true; for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); Loading Loading @@ -156,14 +159,14 @@ void SpMVCSRLightWithoutAtomic16( OffsetsView offsets, if( segmentIdx >= last ) return; Index i; const Index inGroupID = threadIdx.x & 15; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; bool compute = true; for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 ) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); Loading Loading @@ -202,8 +205,9 @@ void SpMVCSRVector( OffsetsView offsets, Index endID = offsets[warpID + 1]; /* Calculate result */ bool compute = true; for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); Loading Loading @@ -304,7 +308,7 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee offsets, first, last, fetch, reduce, keep, zero, grid ); else // if (threadsPerSegment == 32) { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>( SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); } /*else Loading Loading
src/Benchmarks/SpMV/spmv.h +8 −1 Original line number Diff line number Diff line Loading @@ -76,6 +76,9 @@ using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Mat template< typename Real, typename Device, typename Index > using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >; template< typename Real, typename Device, typename Index > using SparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRLight >; template< typename Real, typename Device, typename Index > using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >; Loading Loading @@ -115,6 +118,9 @@ using SymmetricSparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, I template< typename Real, typename Device, typename Index > using SymmetricSparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRHybrid >; template< typename Real, typename Device, typename Index > using SymmetricSparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRLight >; template< typename Real, typename Device, typename Index > using SymmetricSparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRAdaptive >; Loading Loading @@ -591,6 +597,7 @@ benchmarkSpmv( BenchmarkType& benchmark, benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Light >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); Loading @@ -599,7 +606,7 @@ benchmarkSpmv( BenchmarkType& benchmark, #ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Light >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR ); Loading
src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +11 −7 Original line number Diff line number Diff line Loading @@ -47,8 +47,9 @@ void SpMVCSRLightWithoutAtomic2( OffsetsView offsets, const Index maxID = offsets[ segmentIdx + 1]; Real result = zero; bool compute = true; for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) ); Loading Loading @@ -83,8 +84,9 @@ void SpMVCSRLightWithoutAtomic4( OffsetsView offsets, const Index maxID = offsets[segmentIdx + 1]; Real result = zero; bool compute = true; for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) ); Loading Loading @@ -122,8 +124,9 @@ void SpMVCSRLightWithoutAtomic8( OffsetsView offsets, const Index maxID = offsets[segmentIdx + 1]; Real result = zero; bool compute = true; for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) ); Loading Loading @@ -156,14 +159,14 @@ void SpMVCSRLightWithoutAtomic16( OffsetsView offsets, if( segmentIdx >= last ) return; Index i; const Index inGroupID = threadIdx.x & 15; // & is cheaper than % const Index maxID = offsets[segmentIdx + 1]; Real result = zero; bool compute = true; for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 ) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Parallel reduction */ result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); Loading Loading @@ -202,8 +205,9 @@ void SpMVCSRVector( OffsetsView offsets, Index endID = offsets[warpID + 1]; /* Calculate result */ bool compute = true; for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize) result = reduce( result, fetch( i, true ) ); result = reduce( result, fetch( i, compute ) ); /* Reduction */ result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) ); Loading Loading @@ -304,7 +308,7 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee offsets, first, last, fetch, reduce, keep, zero, grid ); else // if (threadsPerSegment == 32) { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>( SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>( offsets, first, last, fetch, reduce, keep, zero, grid ); } /*else Loading