Commit ac783cf2 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Adaptiver CSR kernel seems to be working well.

parent cadfb88a
Loading
Loading
Loading
Loading
+54 −26
Original line number Diff line number Diff line
@@ -113,51 +113,78 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
   else // blockType == Type::LONG - several warps per segment
   {
      // Number of elements processed by previous warps
      const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
         block.getWarpIdx() * MAX_ELEM_PER_WARP;
      Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
      //Index to = begin + (block.getWarpIdx()  + 1) * MAX_ELEM_PER_WARP;
      const Index segmentIdx = block.getFirstSegment();//block.index[0];
      //minID = offsets[block.index[0] ];
      const Index end = offsets[segmentIdx + 1];
      const int tid = threadIdx.x;
      //const int inBlockWarpIdx = block.getWarpIdx();

      if( to > end )
         to = end;
      //if( to > end )
      //   to = end;
      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
      result = zero;
      //printf( "tid %d : start = %d \n", tid, minID + laneID );
      for( Index globalIdx = begin + laneIdx + offset; globalIdx < to; globalIdx += warpSize )
      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx()  );
      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
           globalIdx < end;
           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
      {
         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
         //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
         //if( laneIdx == 0 )
         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
         //    details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
         //result += values[i] * inVector[columnIndexes[i]];
      }

      //printf( "tid %d -> %d \n", tid, result );

      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
      result += __shfl_down_sync(0xFFFFFFFF, result, 1);

      //if( laneIdx == 0 )
      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );

      const Index warpID = threadIdx.x / 32;
      if( laneIdx == 0 )
         multivectorShared[ warpID ] = result;

      __syncthreads();
      // Reduction in multivectorShared
      if( tid < 16 )
      if( block.getWarpIdx() == 0 && laneIdx < 16 )
      {
         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid + 16 ] );
         constexpr int totalWarps = CudaBlockSize / warpSize;
         if( totalWarps >= 32 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
            __syncwarp();
         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  8 ] );
         }
         if( totalWarps >= 16 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  8 ] );
            __syncwarp();
         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  4 ] );
         }
         if( totalWarps >= 8 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  4 ] );
            __syncwarp();
         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  2 ] );
         }
         if( totalWarps >= 4 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  2 ] );
            __syncwarp();
         multivectorShared[ tid ] =  reduce( multivectorShared[ tid ], multivectorShared[ tid +  1 ] );
         }
         if( totalWarps >= 2 )
         {
            multivectorShared[ laneIdx ] =  reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx +  1 ] );
            __syncwarp();
         if( tid == 0 )
         }
         if( laneIdx == 0 )
         {
            printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
            keep( segmentIdx, multivectorShared[ 0 ] );
         }
      }
@@ -216,7 +243,6 @@ struct CSRKernelAdaptiveView
         return;
      }

      this->printBlocks();
      static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
      //static constexpr Index THREADS_SCALAR = 128;
      //static constexpr Index THREADS_VECTOR = 128;
@@ -322,7 +348,7 @@ struct CSRKernelAdaptive
    static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;

   /* How many shared memory use per block in CSR Adaptive kernel */
   static constexpr Index SHARED_PER_BLOCK = 24576;
   static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:

   /* Number of elements in shared memory */
   static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
@@ -364,7 +390,7 @@ struct CSRKernelAdaptive
               if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
                  type = details::Type::VECTOR;
               else
                  type = details::Type::VECTOR; // TODO: Put LONG back
                  type = details::Type::LONG;
                  //type = Type::LONG; //
               return current + 1;
            }
@@ -393,7 +419,9 @@ struct CSRKernelAdaptive
            {
               const Index blocksCount = inBlock.size();
               const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
               const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
               Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
               if( warpsLeft == 0 )
                  warpsLeft = warpsPerCudaBlock;
               //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
               inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft );
               for( Index index = 1; index < warpsLeft; index++ )
+5 −0
Original line number Diff line number Diff line
@@ -190,6 +190,11 @@ struct CSRAdaptiveKernelBlockDescriptor
      return this->warpIdx;
   }

   __cuda_callable__ uint8_t getWarpsCount() const
   {
      return this->warpsCount;
   }

   void print( std::ostream& str ) const
   {
      str << "Type: ";
+9 −8
Original line number Diff line number Diff line
@@ -1070,7 +1070,6 @@ void test_VectorProduct()
       outVector_1.setElement( j, 0 );

   m_1.vectorProduct( inVector_1, outVector_1 );

   EXPECT_EQ( outVector_1.getElement( 0 ),  2 );
   EXPECT_EQ( outVector_1.getElement( 1 ), 10 );
   EXPECT_EQ( outVector_1.getElement( 2 ),  8 );
@@ -1310,7 +1309,7 @@ void test_VectorProduct()

   /////
   // Large test
   const IndexType size( 35 );
   const IndexType size( 1051 );
   //for( int size = 1; size < 1000; size++ )
   {
      //std::cerr << " size = " << size << std::endl;
@@ -1338,26 +1337,28 @@ void test_VectorProduct()
         EXPECT_EQ( out.getElement( i ), i + 1 );

      // Test with large triangular matrix
      Matrix m2( size, size );
      rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } );
      const int rows( size ), columns( size );
      Matrix m2( rows, columns );
      rowCapacities.setSize( rows );
      rowCapacities.evaluate( [=] __cuda_callable__ ( IndexType i ) { return i + 1; } );
      m2.setRowCapacities( rowCapacities );
      auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) {
         if( localIdx <= row )
         {
            value = row -localIdx + 1;
            value = localIdx + 1;
            column = localIdx;
         }
      };
      m2.forAllRows( f2 );
      // check that the matrix was initialized
      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( size );
      TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows );
      m2.getCompressedRowLengths( rowLengths );
      EXPECT_EQ( rowLengths, rowCapacities );

      out.setSize( rows );
      out = 0.0;
      m2.vectorProduct( in, out );
      //std::cerr << out << std::endl;
      for( IndexType i = 0; i < size; i++ )
      for( IndexType i = 0; i < rows; i++ )
         EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 );
   }
}
+6 −6
Original line number Diff line number Diff line
@@ -30,12 +30,12 @@ using MatrixTypes = ::testing::Types
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Host, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
#ifdef HAVE_CUDA
   ,TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    //TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, int,   TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< int,     TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< long,    TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< float,   TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
    TNL::Matrices::SparseMatrix< double,  TNL::Devices::Cuda, long,  TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
#endif
>;