Commit ce8f92fc authored by Jakub Klinkovský
Browse files

Added missing TNL_CHECK_CUDA_DEVICE and fixed stream synchronization after segments CUDA kernels

parent fa61a35c
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
         detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
         cudaThreadSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
#endif
   }
}
+2 −0
Original line number Diff line number Diff line
@@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
            <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
            ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
      }
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
#endif
   }
}
+4 −2
Original line number Diff line number Diff line
@@ -111,7 +111,8 @@ struct EllpackCudaReductionDispatcher
      dim3 blockSize( 256 );
      dim3 gridSize( blocksCount );
      EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
      cudaDeviceSynchronize();
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
   #endif
   }
};
@@ -133,7 +134,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re
      dim3 blockSize( 256 );
      dim3 gridSize( blocksCount );
      EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
      cudaDeviceSynchronize();
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
   #endif
   }
};
+2 −0
Original line number Diff line number Diff line
@@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio
               zero,
               args... );
      }
      cudaStreamSynchronize(0);
      TNL_CHECK_CUDA_DEVICE;
#endif
   }
};
+2 −0
Original line number Diff line number Diff line
@@ -297,6 +297,8 @@ reduceSegments( const OffsetsView& offsets,
                throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
        }
    }
    cudaStreamSynchronize(0);
    TNL_CHECK_CUDA_DEVICE;
#endif
}

Loading