Commit 4c2bccf2 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Optimized vector operations on host - prefetching and unrolling

parent d2a4ec4d
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -322,6 +322,10 @@ find_package( PythonInterp 3 )
#   endif()
#endif()

if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" )
   AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " )
endif()

set( CXX_TEST_FLAGS "-fprofile-arcs -ftest-coverage" )
set( LD_TEST_FLAGS "-lgcov -coverage" )

+4 −1
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@ INSTANTIATE_INT="yes"
INSTANTIATE_LONG_DOUBLE="no"
INSTANTIATE_DOUBLE="yes"
INSTANTIATE_FLOAT="no"
OPTIMIZED_VECTOR_HOST_OPERATIONS="no"
CMAKE="cmake"
CMAKE_ONLY="no"
HELP="no"
@@ -42,6 +43,7 @@ do
                                           INSTANTIATE_DOUBLE="yes"
                                           INSTANTIATE_FLOAT="no"
                                           WITH_CUDA_ARCH="auto" ;;
        --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;;
        --with-cmake=*                   ) CMAKE="${option#*=}" ;;
        --build-jobs=*                   ) BUILD_JOBS="${option#*=}" ;;
        --cmake-only=*                   ) CMAKE_ONLY="${option#*=}" ;;
@@ -97,7 +99,8 @@ ${CMAKE} ${ROOT_DIR} \
         -DINSTANTIATE_DOUBLE=${INSTANTIATE_DOUBLE} \
         -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} \
         -DINSTANTIATE_INT=${INSTANTIATE_INT} \
         -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
         -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} \
         -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS}

if test $? != 0; then
    echo "Error: cmake exited with error code."
+122 −23
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@ namespace TNL {
namespace Containers {   

static const int OpenMPVectorOperationsThreshold = 65536; // TODO: check this threshold
static const int PrefetchDistance = 128;

template< typename Vector >
void VectorOperations< Devices::Host >::addElement( Vector& v,
@@ -111,9 +112,48 @@ getVectorL2Norm( const Vector& v )
{
   typedef typename Vector :: RealType Real;
   typedef typename Vector :: IndexType Index;

   Assert( v. getSize() > 0, );
   Real result( 0.0 );
   const Index n = v. getSize();

#ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
#ifdef __GNUC__
   // We need to get the address of the first element to avoid
   // bounds checking in TNL::Array::operator[]
   const Real* V = v.getData();
#endif

   Real result1 = 0, result2 = 0, result3 = 0, result4 = 0;
   Index i = 0;
   const Index unroll_limit = n - n % 4;
#ifdef HAVE_OPENMP
#pragma omp parallel for \
       if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \
       reduction(+:result1,result2,result3,result4) \
       lastprivate(i)
#endif
   for( i = 0; i < unroll_limit; i += 4 )
   {
#ifdef __GNUC__
      __builtin_prefetch(V + i + PrefetchDistance, 0, 0);
#endif
      result1 += v[ i ] * v[ i ];
      result2 += v[ i + 1 ] * v[ i + 1 ];
      result3 += v[ i + 2 ] * v[ i + 2 ];
      result4 += v[ i + 3 ] * v[ i + 3 ];
   }

   while( i < n )
   {
      result1 += v[ i ] * v[ i ];
      i++;
   }

   return std::sqrt(result1 + result2 + result3 + result4);

#else // OPTIMIZED_VECTOR_HOST_OPERATIONS

   Real result( 0.0 );
#ifdef HAVE_OPENMP
#pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
#endif
@@ -123,6 +163,7 @@ getVectorL2Norm( const Vector& v )
      result += aux * aux;
   }
   return std::sqrt( result );
#endif // OPTIMIZED_VECTOR_HOST_OPERATIONS
}

template< typename Vector >
@@ -360,33 +401,55 @@ typename Vector1 :: RealType VectorOperations< Devices::Host > :: getScalarProdu

   Assert( v1. getSize() > 0, );
   Assert( v1. getSize() == v2. getSize(), );
   const Index n = v1. getSize();

#ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
#ifdef __GNUC__
   // We need to get the address of the first element to avoid
   // bounds checking in TNL::Array::operator[]
   const Real* V1 = v1.getData();
   const Real* V2 = v2.getData();
#endif

   Real dot1 = 0.0, dot2 = 0.0, dot3 = 0.0, dot4 = 0.0;
   Index i = 0;
   const Index unroll_limit = n - n % 4;
#ifdef HAVE_OPENMP
   #pragma omp parallel for \
      if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \
      reduction(+:dot1,dot2,dot3,dot4) \
      lastprivate(i)
#endif
   for( i = 0; i < unroll_limit; i += 4 )
   {
#ifdef __GNUC__
      __builtin_prefetch(V1 + i + PrefetchDistance, 0, 0);
      __builtin_prefetch(V2 + i + PrefetchDistance, 0, 0);
#endif
      dot1 += v1[ i ]     * v2[ i ];
      dot2 += v1[ i + 1 ] * v2[ i + 1 ];
      dot3 += v1[ i + 2 ] * v2[ i + 2 ];
      dot4 += v1[ i + 3 ] * v2[ i + 3 ];
   }

   while( i < n )
   {
      dot1 += v1[ i ] * v2[ i ];
      i++;
   }

   return dot1 + dot2 + dot3 + dot4;

#else // OPTIMIZED_VECTOR_HOST_OPERATIONS

   Real result( 0.0 );
   const Index n = v1. getSize();
#ifdef HAVE_OPENMP
   #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
#endif
   for( Index i = 0; i < n; i++ )
      result += v1[ i ] * v2[ i ];
   /*Real result1( 0.0 ), result2( 0.0 ), result3( 0.0 ), result4( 0.0 ),
        result5( 0.0 ), result6( 0.0 ), result7( 0.0 ), result8( 0.0 );
   Index i( 0 );
   while( i + 8 < n )
   {
      result1 += v1[ i ] * v2[ i ];
      result2 += v1[ i + 1 ] * v2[ i + 1 ];
      result3 += v1[ i + 2 ] * v2[ i + 2 ];
      result4 += v1[ i + 3 ] * v2[ i + 3 ];
      result5 += v1[ i + 4 ] * v2[ i + 4 ];
      result6 += v1[ i + 5 ] * v2[ i + 5 ];
      result7 += v1[ i + 6 ] * v2[ i + 6 ];
      result8 += v1[ i + 7 ] * v2[ i + 7 ];
      i += 8;
   }
   Real result = result1 + result2 + result3 + result4 + result5 +result6 +result7 +result8;
   while( i < n )
      result += v1[ i ] * v2[ i++ ];*/
   return result;
#endif // OPTIMIZED_VECTOR_HOST_OPERATIONS
}

template< typename Vector1, typename Vector2 >
@@ -400,8 +463,43 @@ void VectorOperations< Devices::Host > :: addVector( Vector1& y,

   Assert( x. getSize() > 0, );
   Assert( x. getSize() == y. getSize(), );

   const Index n = y. getSize();

#ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS
#ifdef __GNUC__
   // We need to get the address of the first element to avoid
   // bounds checking in TNL::Array::operator[]
         Real* Y = y.getData();
   const Real* X = x.getData();
#endif

   Index i = 0;
   const Index unroll_limit = n - n % 4;
#ifdef HAVE_OPENMP
   #pragma omp parallel for \
      if( n > OpenMPVectorOperationsThreshold ) \
      lastprivate(i)
#endif
   for(i = 0; i < unroll_limit; i += 4)
   {
#ifdef __GNUC__
      __builtin_prefetch(&y[ i + PrefetchDistance ], 1, 0);
      __builtin_prefetch(&x[ i + PrefetchDistance ], 0, 0);
#endif
      y[ i ]     = thisMultiplicator * y[ i ]     + alpha * x[ i ];
      y[ i + 1 ] = thisMultiplicator * y[ i + 1 ] + alpha * x[ i + 1 ];
      y[ i + 2 ] = thisMultiplicator * y[ i + 2 ] + alpha * x[ i + 2 ];
      y[ i + 3 ] = thisMultiplicator * y[ i + 3 ] + alpha * x[ i + 3 ];
   }

   while( i < n )
   {
      y[i] = thisMultiplicator * y[ i ] + alpha * x[ i ];
      i++;
   }

#else // OPTIMIZED_VECTOR_HOST_OPERATIONS

   if( thisMultiplicator == 1.0 )
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold
@@ -414,6 +512,7 @@ void VectorOperations< Devices::Host > :: addVector( Vector1& y,
#endif
      for( Index i = 0; i < n; i ++ )
         y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ];
#endif // OPTIMIZED_VECTOR_HOST_OPERATIONS
}

template< typename Vector1,