Commit fc12a48d authored by Tomáš Oberhuber, committed by Tomáš Oberhuber
Browse files

And now the real optimization of the Adaptive CSR kernel initialization :).

parent bf4dc990
Loading
Loading
Loading
Loading
+45 −50
Original line number Diff line number Diff line
@@ -119,7 +119,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
      const Index segmentIdx = block.getFirstSegment();//block.index[0];
      //minID = offsets[block.index[0] ];
      const Index end = offsets[segmentIdx + 1];
      const int tid = threadIdx.x;
      //const int tid = threadIdx.x;
      //const int inBlockWarpIdx = block.getWarpIdx();

      //if( to > end )
@@ -382,11 +382,9 @@ struct CSRKernelAdaptive
                    Index &sum )
   {
      sum = 0;
      TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >
         hostOffsets( offsets );
      for (Index current = start; current < size - 1; current++ )
      {
         Index elements = hostOffsets[ current + 1 ] - hostOffsets[ current ];
         Index elements = offsets[ current + 1 ] - offsets[ current ];
         sum += elements;
         if( sum > SHARED_PER_WARP )
         {
@@ -401,7 +399,6 @@ struct CSRKernelAdaptive
                  type = details::Type::VECTOR;
               else
                  type = details::Type::LONG;
                  //type = Type::LONG; //
               return current + 1;
            }
         }
@@ -413,6 +410,8 @@ struct CSRKernelAdaptive
   template< typename Offsets >
   void init( const Offsets& offsets )
   {
      using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
      HostOffsetsType hostOffsets( offsets );
      const Index rows = offsets.getSize();
      Index sum, start( 0 ), nextStart( 0 );

@@ -423,7 +422,7 @@ struct CSRKernelAdaptive
      while( nextStart != rows - 1 )
      {
         details::Type type;
            nextStart = findLimit( start, offsets, rows, type, sum );
         nextStart = findLimit( start, hostOffsets, rows, type, sum );

         if( type == details::Type::LONG )
         {
@@ -432,13 +431,9 @@ struct CSRKernelAdaptive
            Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
            if( warpsLeft == 0 )
               warpsLeft = warpsPerCudaBlock;
               //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
               inBlocks.emplace_back( start, details::Type::LONG, 0, warpsLeft );
               for( Index index = 1; index < warpsLeft; index++ )
               {
            for( Index index = 0; index < warpsLeft; index++ )
               inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
         }
            }
         else
         {
            inBlocks.emplace_back(start, type,
@@ -451,7 +446,7 @@ struct CSRKernelAdaptive
      inBlocks.emplace_back(nextStart);
      this->blocks = inBlocks;
      this->view.setBlocks( blocks );
    };
   }

   void reset()
   {