Commit c72fe76b authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

General parallel reduction on host

parent 5d9bcae2
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -203,7 +203,7 @@ compareMemory( const Element1* destination,
   //TODO: The parallel reduction on the CUDA device with different element types is needed.
   bool result = false;
   Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
   reductionOnCudaDevice( reductionEqualities, size, destination, source, result );
   Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result );
   return result;
}

+17 −17
Original line number Diff line number Diff line
@@ -84,21 +84,21 @@ CudaMultireductionKernel( Operation operation,
   sdata[ tid ] = operation.initialValue();
   while( gid + 4 * gridSizeX < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 );
      gid += 4 * gridSizeX;
   }
   while( gid + 2 * gridSizeX < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      gid += 2 * gridSizeX;
   }
   while( gid < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      gid += gridSizeX;
   }
   __syncthreads();
@@ -111,25 +111,25 @@ CudaMultireductionKernel( Operation operation,
    */
   if( blockSizeX >= 1024 ) {
      if( threadIdx.x < 512 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
      }
      __syncthreads();
   }
   if( blockSizeX >= 512 ) {
      if( threadIdx.x < 256 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
      }
      __syncthreads();
   }
   if( blockSizeX >= 256 ) {
      if( threadIdx.x < 128 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
      }
      __syncthreads();
   }
   if( blockSizeX >= 128 ) {
      if( threadIdx.x <  64 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
      }
      __syncthreads();
   }
@@ -144,22 +144,22 @@ CudaMultireductionKernel( Operation operation,
   if( threadIdx.x < 32 ) {
      volatile ResultType* vsdata = sdata;
      if( blockSizeX >= 64 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
      }
      if( blockSizeX >= 32 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
      }
      if( blockSizeX >= 16 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
      }
      if( blockSizeX >=  8 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
      }
      if( blockSizeX >=  4 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
      }
      if( blockSizeX >=  2 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
      }
   }

+17 −17
Original line number Diff line number Diff line
@@ -69,21 +69,21 @@ CudaReductionKernel( Operation operation,
    */
   while( gid + 4 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      gid += 2 * gridSize;
   }
   while( gid < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
      gid += gridSize;
   }
   __syncthreads();
@@ -98,19 +98,19 @@ CudaReductionKernel( Operation operation,
   if( blockSize >= 1024 )
   {
      if( tid < 512 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 )
   {
      if( tid < 256 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 )
   {
      if( tid < 128 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -118,7 +118,7 @@ CudaReductionKernel( Operation operation,
   if( blockSize >= 128 )
   {
      if( tid <  64 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -132,34 +132,34 @@ CudaReductionKernel( Operation operation,
      volatile ResultType* vsdata = sdata;
      if( blockSize >= 64 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
      }
      // TODO: If blocksize == 32, the following does not work
      // We do not check if tid < 16. Fix it!!!
      if( blockSize >= 32 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >= 16 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  8 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  4 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  2 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
      }
   }
+5 −5
Original line number Diff line number Diff line
@@ -206,7 +206,7 @@ reduce( Operation& operation,
         for( int k = 0; k < n; k++ ) {
            const DataType1* _input1 = input1 + k * ldInput1;
            for( IndexType i = 0; i < block_size; i++ )
               r[ k ] = operation.reduceOnHost( offset + i, r[ k ], _input1, input2 );
               operation.firstReduction( r[ k ], offset + i, _input1, input2 );
         }
      }

@@ -216,7 +216,7 @@ reduce( Operation& operation,
         for( int k = 0; k < n; k++ ) {
            const DataType1* _input1 = input1 + k * ldInput1;
            for( IndexType i = blocks * block_size; i < size; i++ )
               r[ k ] = operation.reduceOnHost( i, r[ k ], _input1, input2 );
               operation.firstReduction( r[ k ], i, _input1, input2 );
         }
      }

@@ -224,7 +224,7 @@ reduce( Operation& operation,
      #pragma omp critical
      {
         for( int k = 0; k < n; k++ )
            operation.commonReductionOnDevice( result[ k ], r[ k ] );
            operation.commonReduction( result[ k ], r[ k ] );
      }
   }
   else {
@@ -237,14 +237,14 @@ reduce( Operation& operation,
         for( int k = 0; k < n; k++ ) {
            const DataType1* _input1 = input1 + k * ldInput1;
            for( IndexType i = 0; i < block_size; i++ )
               result[ k ] = operation.reduceOnHost( offset + i, result[ k ], _input1, input2 );
               operation.firstReduction( result[ k ], offset + i, _input1, input2 );
         }
      }

      for( int k = 0; k < n; k++ ) {
         const DataType1* _input1 = input1 + k * ldInput1;
         for( IndexType i = blocks * block_size; i < size; i++ )
            result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 );
            operation.firstReduction( result[ k ], i, _input1, input2 );
      }
#ifdef HAVE_OPENMP
   }
+50 −15
Original line number Diff line number Diff line
@@ -8,29 +8,64 @@

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Tomas Oberhuber, Jakub Klinkovsky

#pragma once 

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/MIC.h>

namespace TNL {
namespace Containers {
namespace Algorithms {   

// TODO: The Devices::Host specialization should be as fast as all the
// manual implementations in VectorOperations (see Multireduction.h for
// the analogous structure).
// Primary template of the parallel-reduction dispatcher.
// Intentionally left empty: only the device-specific specializations
// below (Devices::Cuda, Devices::Host, Devices::MIC) are usable, so
// instantiating Reduction for an unsupported device fails at compile time.
template< typename Device >
class Reduction
{
};

// Specialization of the reduction dispatcher for CUDA devices.
template<>
class Reduction< Devices::Cuda >
{
public:
   // Reduces `size` elements taken from deviceInput1 and deviceInput2
   // according to `operation` (which supplies the element types and the
   // reduction semantics via its DataType1/DataType2/ResultType members),
   // storing the outcome in `result`. Returns a bool status flag.
   // NOTE(review): the `deviceInput*` names suggest the pointers refer to
   // GPU global memory — confirm against Reduction_impl.h.
   template< typename Operation, typename Index >
   static bool
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
};

// Specialization of the reduction dispatcher for the host (CPU).
template<>
class Reduction< Devices::Host >
{
public:
   // Reduces `size` elements taken from deviceInput1 and deviceInput2
   // according to `operation`, storing the outcome in `result`.
   // Returns a bool status flag.
   // NOTE(review): the parameter names still say `deviceInput*` even though
   // this is the Host specialization — presumably these are ordinary host
   // pointers here; consider renaming for clarity (verify against callers).
   template< typename Operation, typename Index >
   static bool
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
};

// Specialization of the reduction dispatcher for Intel MIC devices.
// Fix: a leftover pre-change line from the old free-function API
// (`bool reductionOnCudaDevice( const Operation& operation,`) was
// interleaved into the class body, making it ill-formed; it is removed
// so this declaration matches the Cuda and Host specializations exactly.
template<>
class Reduction< Devices::MIC >
{
public:
   // Reduces `size` elements taken from deviceInput1 and deviceInput2
   // according to `operation`, storing the outcome in `result`.
   // Returns a bool status flag. Signature is identical to the other
   // device specializations so call sites can switch on Device only.
   template< typename Operation, typename Index >
   static bool
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
};

} // namespace Algorithms
} // namespace Containers
} // namespace TNL

#include "Reduction_impl.h"
Loading