Commit a0c48268 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'JK/segments-fix' into 'develop'

Segments: fix for empty matrices

See merge request !115
parents fa61a35c ade448ee
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
         detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
         cudaThreadSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
#endif
   }
}
+2 −0
Original line number Diff line number Diff line
@@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
            <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
      }
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
#endif
   }
}
+8 −2
Original line number Diff line number Diff line
@@ -105,13 +105,16 @@ struct EllpackCudaReductionDispatcher
   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
   {
   #ifdef HAVE_CUDA
      if( last <= first )
         return;
      const Index segmentsCount = last - first;
      const Index threadsCount = segmentsCount * 32;
      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
      dim3 blockSize( 256 );
      dim3 gridSize( blocksCount );
      EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
      cudaDeviceSynchronize();
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
   #endif
   }
};
@@ -127,13 +130,16 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re
   exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
   {
   #ifdef HAVE_CUDA
      if( last <= first )
         return;
      const Index segmentsCount = last - first;
      const Index threadsCount = segmentsCount * 32;
      const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
      dim3 blockSize( 256 );
      dim3 gridSize( blocksCount );
      EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
      cudaDeviceSynchronize();
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
   #endif
   }
};
+2 −0
Original line number Diff line number Diff line
@@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio
               zero,
               args... );
      }
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
#endif
   }
};
+7 −0
Original line number Diff line number Diff line
@@ -175,7 +175,10 @@ void
CSRHybridKernel< Index, Device, ThreadsInBlock >::
init( const Offsets& offsets )
{
    TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" );
    const Index segmentsCount = offsets.getSize() - 1;
    if( segmentsCount <= 0 )
       return;
    const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
    this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() );
    TNL_ASSERT_GE( threadsPerSegment, 0, "" );
@@ -245,6 +248,8 @@ reduceSegments( const OffsetsView& offsets,
    TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" );

#ifdef HAVE_CUDA
    if( last <= first )
       return;
    const size_t threadsCount = this->threadsPerSegment * ( last - first );
    dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock );
    TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
@@ -297,6 +302,8 @@ reduceSegments( const OffsetsView& offsets,
                throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
        }
    }
    cudaStreamSynchronize(0);
    TNL_CHECK_CUDA_DEVICE;
#endif
}

Loading