Commit 5c983e49 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

[WIP] Rewritting reduction using lambda functions.

parent c60aa9e4
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -220,9 +220,10 @@ containsValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );
   if( size == 0 ) return false;
   Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
   reductionContainsValue.setValue( value );
   return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr );
   //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
   //reductionContainsValue.setValue( value );
   auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); };
   return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0  );
}

template< typename Element,
+59 −53
Original line number Diff line number Diff line
@@ -41,20 +41,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi

template< int blockSize,
   typename Real,
   typename FirstPhase,
   typename SecondPhase,
   typename DataFetcher,
   typename Reduction,
   typename Index,
   typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) >
   typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) >
__global__ void
__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
CudaReductionKernel( const Real& initialValue,
                     FirstReduction& firstReduction,
                     SecondReduction& secondReduction,
CudaReductionKernel( const Real& zero,
                     const DataFetcher& dataFetcher,
                     const Reduction& reduction,
                     const Index size,
                     ResultType* output )
                     Result* output )
{
   typedef Index IndexType;
   typedef typename Operation::ResultType ResultType;
   using RealType = Real;
   using IndexType = Index;
   using ResultType = Result;

   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();

@@ -67,28 +68,28 @@ CudaReductionKernel( const Real& initialValue,
         IndexType gid = blockIdx.x * blockDim. x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   sdata[ tid ] = initialValue;
   sdata[ tid ] = zero;
   /***
    * Read data into the shared memory. We start with the
    * sequential reduction.
    */
   while( gid + 4 * gridSize < size )
   {
      sdata[ tid ] = firstReduction( sdata[ tid ], gid );
      sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize );
      sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize );
      sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < size )
   {
      firstReduction( sdata[ tid ], gid,                input1, input2 );
      firstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
      gid += 2 * gridSize;
   }
   while( gid < size )
   {
      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
      sdata[ tid ] =  reduction( sdata[ tid ], dataFetcher( gid ) );
      gid += gridSize;
   }
   __syncthreads();
@@ -103,19 +104,19 @@ CudaReductionKernel( const Real& initialValue,
   if( blockSize >= 1024 )
   {
      if( tid < 512 )
         operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
         reduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 )
   {
      if( tid < 256 )
         operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
         reduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 )
   {
      if( tid < 128 )
         operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
         reduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -123,7 +124,7 @@ CudaReductionKernel( const Real& initialValue,
   if( blockSize >= 128 )
   {
      if( tid <  64 )
         operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
         reduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -137,34 +138,34 @@ CudaReductionKernel( const Real& initialValue,
      volatile ResultType* vsdata = sdata;
      if( blockSize >= 64 )
      {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
         reduction( vsdata[ tid ], vsdata[ tid + 32 ] );
         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
      }
      // TODO: If blocksize == 32, the following does not work
      // We do not check if tid < 16. Fix it!!!
      if( blockSize >= 32 )
      {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
         reduction( vsdata[ tid ], vsdata[ tid + 16 ] );
         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >= 16 )
      {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
         reduction( vsdata[ tid ], vsdata[ tid + 8 ] );
         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  8 )
      {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
         reduction( vsdata[ tid ], vsdata[ tid + 4 ] );
         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  4 )
      {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
         reduction( vsdata[ tid ], vsdata[ tid + 2 ] );
         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  2 )
      {
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
         reduction( vsdata[ tid ], vsdata[ tid + 1 ] );
         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
      }
   }
@@ -180,16 +181,21 @@ CudaReductionKernel( const Real& initialValue,

}

template< typename Operation, typename Index >
template< typename Real,
   typename DataFetcher,
   typename Reduction,
   typename Index,
   typename Result >
int
CudaReductionKernelLauncher( Operation& operation,
                             const Index size,
                             const typename Operation::DataType1* input1,
                             const typename Operation::DataType2* input2,
                             typename Operation::ResultType*& output )
CudaReductionKernelLauncher( const Index size,
                             const Reduction& reduction,
                             const DataFetcher& dataFetcher,
                             const Real& zero,
                             Result*& output )
{
   typedef Index IndexType;
   typedef typename Operation::ResultType ResultType;
   using RealType = Real;
   using IndexType = Index;
   using ResultType = Result;

   // The number of blocks should be a multiple of the number of multiprocessors
   // to ensure optimum balancing of the load. This is very important, because
@@ -227,55 +233,55 @@ CudaReductionKernelLauncher( Operation& operation,
   {
      case 512:
         CudaReductionKernel< 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case 256:
         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel< 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case 128:
         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel< 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case  64:
         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel<  64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case  32:
         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel<  32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case  16:
         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel<  16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
     case   8:
         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel<   8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case   4:
         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel<   4 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case   2:
         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Operation, Index >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared);

         CudaReductionKernel<   2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
         break;
      case   1:
         TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
+29 −20
Original line number Diff line number Diff line
@@ -29,36 +29,45 @@ template<>
class Reduction< Devices::Cuda >
{
public:
   template< typename Operation, typename Index >
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2 );
   template< typename Index,
             typename ReductionOperation,
             typename DataFetcher,
             typename Result = decltype( DataFetcher::operator() ) >
   static Result
   reduce( const Index size,
           ReductionOperation& reduction,
           DataFetcher& dataFetcher,
           const Result& zero );
};

template<>
class Reduction< Devices::Host >
{
public:
   template< typename Operation, typename Index >
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2 );
   template< typename Index,
             typename ReductionOperation,
             typename DataFetcher,
             typename Result = decltype( DataFetcher::operator() ) >
   static Result
   reduce( const Index size,
           ReductionOperation& reduction,
           DataFetcher& dataFetcher,
           const Result& zero );
};

template<>
class Reduction< Devices::MIC >
{
public:
   template< typename Operation, typename Index >
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2 );
   template< typename Index,
             typename ReductionOperation,
             typename DataFetcher,
             typename Result = decltype( DataFetcher::operator() ) >
   static Result
   reduce( const Index size,
           ReductionOperation& reduction,
           DataFetcher& dataFetcher,
           const Result& zero );
};

} // namespace Algorithms
+76 −64
Original line number Diff line number Diff line
@@ -38,35 +38,37 @@ namespace Algorithms {
 */
static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;

template< typename Operation, typename Index >
typename Operation::ResultType

template< typename Index,
          typename ReductionOperation,
          typename DataFetcher,
          typename Result >
Result
Reduction< Devices::Cuda >::
reduce( Operation& operation,
        const Index size,
        const typename Operation::DataType1* deviceInput1,
        const typename Operation::DataType2* deviceInput2 )
   reduce( const Index size,
           ReductionOperation& reduction,
           DataFetcher& dataFetcher,
           const Result& zero )
{
#ifdef HAVE_CUDA

   typedef Index IndexType;
   typedef typename Operation::DataType1 DataType1;
   typedef typename Operation::DataType2 DataType2;
   typedef typename Operation::ResultType ResultType;
   typedef typename Operation::LaterReductionOperation LaterReductionOperation;
   using IndexType = Index;
   using ResultType = Result;

   /***
    * Only fundamental and pointer types can be safely reduced on host. Complex
    * objects stored on the device might contain pointers into the device memory,
    * in which case reduction on host might fail.
    */
   constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value;
   //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value;
   constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value;

   /***
    * First check if the input array(s) is/are large enough for the reduction on GPU.
    * Otherwise copy it/them to host and reduce on CPU.
    */
   if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
   // With lambda function we cannot reduce data on host - we do not know the data here.
   /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
   {
      typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ];
      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size );
@@ -74,12 +76,12 @@ reduce( Operation& operation,
         using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
         typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ];
         ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 );
         return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 );
      }
      else {
         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr );
      }
   }
   }*/

   #ifdef CUDA_REDUCTION_PROFILING
      Timer timer;
@@ -91,10 +93,10 @@ reduce( Operation& operation,
    * Reduce the data on the CUDA device.
    */
   ResultType* deviceAux1( 0 );
   IndexType reducedSize = CudaReductionKernelLauncher( operation,
                                                        size,
                                                        deviceInput1,
                                                        deviceInput2,
   IndexType reducedSize = CudaReductionKernelLauncher( size,
                                                        reduction,
                                                        dataFetcher,
                                                        zero,
                                                        deviceAux1 );
   #ifdef CUDA_REDUCTION_PROFILING
      timer.stop();
@@ -120,8 +122,8 @@ reduce( Operation& operation,
      /***
       * Reduce the data on the host system.
       */
      LaterReductionOperation laterReductionOperation;
      const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr );
      auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; };
      const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero );

      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
@@ -134,12 +136,13 @@ reduce( Operation& operation,
      /***
       * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
       */
      LaterReductionOperation laterReductionOperation;
      //LaterReductionOperation laterReductionOperation;
      auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; };
      while( reducedSize > 1 ) {
         reducedSize = CudaReductionKernelLauncher( laterReductionOperation,
                                                    reducedSize,
                                                    deviceAux1,
                                                    (ResultType*) 0,
         reducedSize = CudaReductionKernelLauncher( reducedSize,
                                                    reduction,
                                                    copyFetch,
                                                    zero,
                                                    deviceAux1 );
      }

@@ -166,18 +169,19 @@ reduce( Operation& operation,
#endif
};

template< typename Operation, typename Index >
typename Operation::ResultType
template< typename Index,
          typename ReductionOperation,
          typename DataFetcher,
          typename Result >
Result
Reduction< Devices::Host >::
reduce( Operation& operation,
        const Index size,
        const typename Operation::DataType1* input1,
        const typename Operation::DataType2* input2 )
   reduce( const Index size,
           ReductionOperation& reduction,
           DataFetcher& dataFetcher,
           const Result& zero )
{
   typedef Index IndexType;
   typedef typename Operation::DataType1 DataType1;
   typedef typename Operation::DataType2 DataType2;
   typedef typename Operation::ResultType ResultType;
   using IndexType = Index;
   using ResultType = Result;

   constexpr int block_size = 128;
   const int blocks = size / block_size;
@@ -185,23 +189,24 @@ reduce( Operation& operation,
#ifdef HAVE_OPENMP
   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
      // global result variable
      ResultType result = operation.initialValue();
      ResultType result = zero;
#pragma omp parallel
      {
         // initialize array for thread-local results
         ResultType r[ 4 ] = { operation.initialValue(),
                               operation.initialValue(),
                               operation.initialValue(),
                               operation.initialValue() };
         ResultType r[ 4 ] = { zero, zero, zero, zero  };

         #pragma omp for nowait
         for( int b = 0; b < blocks; b++ ) {
            const IndexType offset = b * block_size;
            for( int i = 0; i < block_size; i += 4 ) {
               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
               r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
               r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
               r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
               r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
               /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 );
               operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 );
               operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 );
               operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/
            }
         }

@@ -209,18 +214,19 @@ reduce( Operation& operation,
         #pragma omp single nowait
         {
            for( IndexType i = blocks * block_size; i < size; i++ )
               operation.firstReduction( r[ 0 ], i, input1, input2 );
               r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
               //operation.dataFetcher( r[ 0 ], i, input1, input2 );
         }

         // local reduction of unrolled results
         operation.commonReduction( r[ 0 ], r[ 2 ] );
         operation.commonReduction( r[ 1 ], r[ 3 ] );
         operation.commonReduction( r[ 0 ], r[ 1 ] );
         r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
         r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
         r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );

         // inter-thread reduction of local results
         #pragma omp critical
         {
            operation.commonReduction( result, r[ 0 ] );
            result = reduction( result, r[ 0 ] );
         }
      }
      return result;
@@ -229,37 +235,43 @@ reduce( Operation& operation,
#endif
      if( blocks > 1 ) {
         // initialize array for unrolled results
         ResultType r[ 4 ] = { operation.initialValue(),
                               operation.initialValue(),
                               operation.initialValue(),
                               operation.initialValue() };
         ResultType r[ 4 ] = { zero, zero, zero, zero };

         // main reduction (explicitly unrolled loop)
         for( int b = 0; b < blocks; b++ ) {
            const IndexType offset = b * block_size;
            for( int i = 0; i < block_size; i += 4 ) {
               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
               r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
               r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
               r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
               r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
               /*operation.dataFetcher( r[ 0 ], offset + i,     input1, input2 );
               operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 );
               operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 );
               operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/
            }
         }

         // reduction of the last, incomplete block (not unrolled)
         for( IndexType i = blocks * block_size; i < size; i++ )
            operation.firstReduction( r[ 0 ], i, input1, input2 );
            r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
            //operation.dataFetcher( r[ 0 ], i, input1, input2 );

         // reduction of unrolled results
         operation.commonReduction( r[ 0 ], r[ 2 ] );
         operation.commonReduction( r[ 1 ], r[ 3 ] );
         operation.commonReduction( r[ 0 ], r[ 1 ] );
         r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
         r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
         r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );

         /*operation.reduction( r[ 0 ], r[ 2 ] );
         operation.reduction( r[ 1 ], r[ 3 ] );
         operation.reduction( r[ 0 ], r[ 1 ] );*/

         return r[ 0 ];
      }
      else {
         ResultType result = operation.initialValue();
         ResultType result = zero;
         for( IndexType i = 0; i < size; i++ )
            operation.firstReduction( result, i, input1, input2 );
            result = reduction( result, dataFetcher( i ) );
         return result;
      }
#ifdef HAVE_OPENMP
+18 −6

File changed.

Preview size limit exceeded, changes collapsed.

Loading