Commit 81232c66 authored by Jakub Klinkovský
Browse files

Moved CudaMultireductionKernel, CudaReductionKernel and CudaScanKernel into a 'detail' namespace

The kernels are not interesting for users and Doxygen made a mess out of
various CUDA declarations. The 'detail' namespace is considered internal
and excluded from processing by Doxygen.
parent 3c7d5d42
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@
#include <TNL/Assert.h>
#include <TNL/Algorithms/Multireduction.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/Algorithms/CudaMultireductionKernel.h>
#include <TNL/Algorithms/detail/CudaMultireductionKernel.h>

#ifdef CUDA_REDUCTION_PROFILING
#include <TNL/Timer.h>
@@ -212,7 +212,7 @@ reduce( const Result zero,

   // start the reduction on the GPU
   Result* deviceAux1 = nullptr;
   const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
   const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );

   #ifdef CUDA_REDUCTION_PROFILING
      timer.stop();
+3 −3
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@
//#define CUDA_REDUCTION_PROFILING

#include <TNL/Algorithms/Reduction.h>
#include <TNL/Algorithms/CudaReductionKernel.h>
#include <TNL/Algorithms/detail/CudaReductionKernel.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>

#ifdef CUDA_REDUCTION_PROFILING
@@ -311,7 +311,7 @@ reduce( const Index begin,
      timer.start();
   #endif

   CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
   detail::CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );

   // start the reduce on the GPU
   Result* deviceAux1( 0 );
@@ -401,7 +401,7 @@ reduceWithArgument( const Index begin,
      timer.start();
   #endif

   CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );
   detail::CudaReductionKernelLauncher< Index, Result > reductionLauncher( begin, end );

   // start the reduce on the GPU
   Result* deviceAux1( nullptr );
+4 −4
Original line number Diff line number Diff line
@@ -17,7 +17,7 @@
#include <TNL/Assert.h>
#include <TNL/Containers/Array.h>
#include <TNL/Containers/StaticArray.h>
#include <TNL/Algorithms/CudaScanKernel.h>
#include <TNL/Algorithms/detail/CudaScanKernel.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/NotImplementedError.h>

@@ -227,7 +227,7 @@ perform( Vector& v,
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
      end - begin,
      &v.getData()[ begin ],  // input
      &v.getData()[ begin ],  // output
@@ -253,7 +253,7 @@ performFirstPhase( Vector& v,
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   return CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
   return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
      end - begin,
      &v.getData()[ begin ],  // input
      &v.getData()[ begin ],  // output
@@ -281,7 +281,7 @@ performSecondPhase( Vector& v,
   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
      end - begin,
      &v.getData()[ begin ],  // output
      blockShifts.getData(),
+2 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@

namespace TNL {
namespace Algorithms {
namespace detail {

#ifdef HAVE_CUDA
/****
@@ -281,5 +282,6 @@ CudaMultireductionKernelLauncher( const Result zero,
#endif
}

} // namespace detail
} // namespace Algorithms
} // namespace TNL
+2 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@

namespace TNL {
namespace Algorithms {
namespace detail {

/****
 * The performance of this kernel is very sensitive to register usage.
@@ -642,5 +643,6 @@ struct CudaReductionKernelLauncher
      Index reducedSize;
};

} // namespace detail
} // namespace Algorithms
} // namespace TNL
Loading