Commit b9603592 authored by Tomáš Oberhuber, committed by Jakub Klinkovský
Browse files

Fixing Light CSR kernel.

parent 45ad3fa7
Loading
Loading
Loading
Loading
+8 −1
Original line number Diff line number Diff line
@@ -76,6 +76,9 @@ using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Mat
// Alias template for Matrices::SparseMatrix over Real/Device/Index, instantiated with
// Matrices::GeneralMatrix and the Algorithms::Segments::CSRHybrid segments template.
// Passed as a template-template argument to benchmarkSpMV / benchmarkBinarySpMV in benchmarkSpmv.
template< typename Real, typename Device, typename Index >
using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >;

// Alias template for Matrices::SparseMatrix over Real/Device/Index, instantiated with
// Matrices::GeneralMatrix and the Algorithms::Segments::CSRLight segments template.
// Passed as a template-template argument to benchmarkSpMV / benchmarkBinarySpMV in benchmarkSpmv.
template< typename Real, typename Device, typename Index >
using SparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRLight >;

// Alias template for Matrices::SparseMatrix over Real/Device/Index, instantiated with
// Matrices::GeneralMatrix and the Algorithms::Segments::CSRAdaptive segments template.
// Passed as a template-template argument to benchmarkSpMV / benchmarkBinarySpMV in benchmarkSpmv.
template< typename Real, typename Device, typename Index >
using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >;

@@ -115,6 +118,9 @@ using SymmetricSparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, I
// Alias template for Matrices::SparseMatrix over Real/Device/Index, instantiated with
// Matrices::SymmetricMatrix and the Algorithms::Segments::CSRHybrid segments template.
template< typename Real, typename Device, typename Index >
using SymmetricSparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRHybrid >;

// Alias template for Matrices::SparseMatrix over Real/Device/Index, instantiated with
// Matrices::SymmetricMatrix and the Algorithms::Segments::CSRLight segments template.
template< typename Real, typename Device, typename Index >
using SymmetricSparseMatrix_CSR_Light = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRLight >;

// Alias template for Matrices::SparseMatrix over Real/Device/Index, instantiated with
// Matrices::SymmetricMatrix and the Algorithms::Segments::CSRAdaptive segments template.
template< typename Real, typename Device, typename Index >
using SymmetricSparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::SymmetricMatrix, Algorithms::Segments::CSRAdaptive >;

@@ -591,6 +597,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid                   >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Light                    >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive                 >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_Ellpack                      >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkSpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
@@ -599,7 +606,7 @@ benchmarkSpmv( BenchmarkType& benchmark,
#ifdef WITH_TNL_BENCHMARK_SPMV_BINARY_MATRICES
   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Scalar             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Vector             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Hybrid             >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Light              >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_CSR_Adaptive           >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_Ellpack                >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
   benchmarkBinarySpMV< Real, HostMatrixType, SparseMatrix_SlicedEllpack          >( benchmark, hostMatrix, hostOutVector, inputFileName, verboseMR );
+11 −7
Original line number Diff line number Diff line
@@ -47,8 +47,9 @@ void SpMVCSRLightWithoutAtomic2( OffsetsView offsets,
   const Index maxID = offsets[ segmentIdx  + 1];

   Real result = zero;
   bool compute = true;
   for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2)
      result = reduce( result, fetch( i, true ) );
      result = reduce( result, fetch( i, compute ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
@@ -83,8 +84,9 @@ void SpMVCSRLightWithoutAtomic4( OffsetsView offsets,
   const Index maxID = offsets[segmentIdx + 1];

   Real result = zero;
   bool compute = true;
   for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4)
      result = reduce( result, fetch( i, true ) );
      result = reduce( result, fetch( i, compute ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
@@ -122,8 +124,9 @@ void SpMVCSRLightWithoutAtomic8( OffsetsView offsets,
   const Index maxID = offsets[segmentIdx + 1];

   Real result = zero;
   bool compute = true;
   for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8)
      result = reduce( result, fetch( i, true ) );
      result = reduce( result, fetch( i, compute ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) );
@@ -156,14 +159,14 @@ void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
   if( segmentIdx >= last )
      return;


   Index i;
   const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
   const Index maxID = offsets[segmentIdx + 1];

   Real result = zero;
   bool compute = true;
   for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 )
      result = reduce( result, fetch( i, true ) );
      result = reduce( result, fetch( i, compute ) );

   /* Parallel reduction */
   result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
@@ -202,8 +205,9 @@ void SpMVCSRVector( OffsetsView offsets,
   Index endID = offsets[warpID + 1];

   /* Calculate result */
   bool compute = true;
   for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
      result = reduce( result, fetch( i, true ) );
      result = reduce( result, fetch( i, compute ) );

   /* Reduction */
   result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
@@ -304,7 +308,7 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
               offsets, first, last, fetch, reduce, keep, zero, grid );
         else // if (threadsPerSegment == 32)
         { // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector
            SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>(
            SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
               offsets, first, last, fetch, reduce, keep, zero, grid );
         }
         /*else