Loading CMakeLists.txt +4 −0 Original line number Diff line number Diff line Loading @@ -322,6 +322,10 @@ find_package( PythonInterp 3 ) # endif() #endif() if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" ) AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " ) endif() set( CXX_TEST_FLAGS "-fprofile-arcs -ftest-coverage" ) set( LD_TEST_FLAGS "-lgcov -coverage" ) Loading build +4 −1 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" OPTIMIZED_VECTOR_HOST_OPERATIONS="no" CMAKE="cmake" CMAKE_ONLY="no" HELP="no" Loading Loading @@ -42,6 +43,7 @@ do INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;; --with-cmake=* ) CMAKE="${option#*=}" ;; --build-jobs=* ) BUILD_JOBS="${option#*=}" ;; --cmake-only=* ) CMAKE_ONLY="${option#*=}" ;; Loading Loading @@ -97,7 +99,8 @@ ${CMAKE} ${ROOT_DIR} \ -DINSTANTIATE_DOUBLE=${INSTANTIATE_DOUBLE} \ -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} \ -DINSTANTIATE_INT=${INSTANTIATE_INT} \ -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} \ -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS} if test $? != 0; then echo "Error: cmake exited with error code." Loading src/TNL/Containers/VectorOperationsHost_impl.h +122 −23 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ namespace TNL { namespace Containers { static const int OpenMPVectorOperationsThreshold = 65536; // TODO: check this threshold static const int PrefetchDistance = 128; template< typename Vector > void VectorOperations< Devices::Host >::addElement( Vector& v, Loading Loading @@ -111,9 +112,48 @@ getVectorL2Norm( const Vector& v ) { typedef typename Vector :: RealType Real; typedef typename Vector :: IndexType Index; Assert( v. getSize() > 0, ); Real result( 0.0 ); const Index n = v. getSize(); #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS #ifdef __GNUC__ // We need to get the address of the first element to avoid // bounds checking in TNL::Array::operator[] const Real* V = v.getData(); #endif Real result1 = 0, result2 = 0, result3 = 0, result4 = 0; Index i = 0; const Index unroll_limit = n - n % 4; #ifdef HAVE_OPENMP #pragma omp parallel for \ if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \ reduction(+:result1,result2,result3,result4) \ lastprivate(i) #endif for( i = 0; i < unroll_limit; i += 4 ) { #ifdef __GNUC__ __builtin_prefetch(V + i + PrefetchDistance, 0, 0); #endif result1 += v[ i ] * v[ i ]; result2 += v[ i + 1 ] * v[ i + 1 ]; result3 += v[ i + 2 ] * v[ i + 2 ]; result4 += v[ i + 3 ] * v[ i + 3 ]; } while( i < n ) { result1 += v[ i ] * v[ i ]; i++; } return std::sqrt(result1 + result2 + result3 + result4); #else // OPTIMIZED_VECTOR_HOST_OPERATIONS Real result( 0.0 ); #ifdef HAVE_OPENMP #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold #endif Loading @@ -123,6 +163,7 @@ getVectorL2Norm( const Vector& v ) result += aux * aux; } return std::sqrt( result ); #endif // OPTIMIZED_VECTOR_HOST_OPERATIONS } template< typename Vector > Loading Loading @@ -360,33 +401,55 @@ typename Vector1 :: RealType VectorOperations< Devices::Host > :: getScalarProdu Assert( v1. getSize() > 0, ); Assert( v1. getSize() == v2. getSize(), ); const Index n = v1. getSize(); #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS #ifdef __GNUC__ // We need to get the address of the first element to avoid // bounds checking in TNL::Array::operator[] const Real* V1 = v1.getData(); const Real* V2 = v2.getData(); #endif Real dot1 = 0.0, dot2 = 0.0, dot3 = 0.0, dot4 = 0.0; Index i = 0; const Index unroll_limit = n - n % 4; #ifdef HAVE_OPENMP #pragma omp parallel for \ if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \ reduction(+:dot1,dot2,dot3,dot4) \ lastprivate(i) #endif for( i = 0; i < unroll_limit; i += 4 ) { #ifdef __GNUC__ __builtin_prefetch(V1 + i + PrefetchDistance, 0, 0); __builtin_prefetch(V2 + i + PrefetchDistance, 0, 0); #endif dot1 += v1[ i ] * v2[ i ]; dot2 += v1[ i + 1 ] * v2[ i + 1 ]; dot3 += v1[ i + 2 ] * v2[ i + 2 ]; dot4 += v1[ i + 3 ] * v2[ i + 3 ]; } while( i < n ) { dot1 += v1[ i ] * v2[ i ]; i++; } return dot1 + dot2 + dot3 + dot4; #else // OPTIMIZED_VECTOR_HOST_OPERATIONS Real result( 0.0 ); const Index n = v1. getSize(); #ifdef HAVE_OPENMP #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold #endif for( Index i = 0; i < n; i++ ) result += v1[ i ] * v2[ i ]; /*Real result1( 0.0 ), result2( 0.0 ), result3( 0.0 ), result4( 0.0 ), result5( 0.0 ), result6( 0.0 ), result7( 0.0 ), result8( 0.0 ); Index i( 0 ); while( i + 8 < n ) { result1 += v1[ i ] * v2[ i ]; result2 += v1[ i + 1 ] * v2[ i + 1 ]; result3 += v1[ i + 2 ] * v2[ i + 2 ]; result4 += v1[ i + 3 ] * v2[ i + 3 ]; result5 += v1[ i + 4 ] * v2[ i + 4 ]; result6 += v1[ i + 5 ] * v2[ i + 5 ]; result7 += v1[ i + 6 ] * v2[ i + 6 ]; result8 += v1[ i + 7 ] * v2[ i + 7 ]; i += 8; } Real result = result1 + result2 + result3 + result4 + result5 +result6 +result7 +result8; while( i < n ) result += v1[ i ] * v2[ i++ ];*/ return result; #endif // OPTIMIZED_VECTOR_HOST_OPERATIONS } template< typename Vector1, typename Vector2 > Loading @@ -400,8 +463,43 @@ void VectorOperations< Devices::Host > :: addVector( Vector1& y, Assert( x. getSize() > 0, ); Assert( x. getSize() == y. getSize(), ); const Index n = y. getSize(); #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS #ifdef __GNUC__ // We need to get the address of the first element to avoid // bounds checking in TNL::Array::operator[] Real* Y = y.getData(); const Real* X = x.getData(); #endif Index i = 0; const Index unroll_limit = n - n % 4; #ifdef HAVE_OPENMP #pragma omp parallel for \ if( n > OpenMPVectorOperationsThreshold ) \ lastprivate(i) #endif for(i = 0; i < unroll_limit; i += 4) { #ifdef __GNUC__ __builtin_prefetch(&y[ i + PrefetchDistance ], 1, 0); __builtin_prefetch(&x[ i + PrefetchDistance ], 0, 0); #endif y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; y[ i + 1 ] = thisMultiplicator * y[ i + 1 ] + alpha * x[ i + 1 ]; y[ i + 2 ] = thisMultiplicator * y[ i + 2 ] + alpha * x[ i + 2 ]; y[ i + 3 ] = thisMultiplicator * y[ i + 3 ] + alpha * x[ i + 3 ]; } while( i < n ) { y[i] = thisMultiplicator * y[ i ] + alpha * x[ i ]; i++; } #else // OPTIMIZED_VECTOR_HOST_OPERATIONS if( thisMultiplicator == 1.0 ) #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold Loading @@ -414,6 +512,7 @@ void VectorOperations< Devices::Host > :: addVector( Vector1& y, #endif for( Index i = 0; i < n; i ++ ) y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; #endif // OPTIMIZED_VECTOR_HOST_OPERATIONS } template< typename Vector1, Loading Loading
CMakeLists.txt +4 −0 Original line number Diff line number Diff line Loading @@ -322,6 +322,10 @@ find_package( PythonInterp 3 ) # endif() #endif() if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" ) AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " ) endif() set( CXX_TEST_FLAGS "-fprofile-arcs -ftest-coverage" ) set( LD_TEST_FLAGS "-lgcov -coverage" ) Loading
build +4 −1 Original line number Diff line number Diff line Loading @@ -13,6 +13,7 @@ INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" OPTIMIZED_VECTOR_HOST_OPERATIONS="no" CMAKE="cmake" CMAKE_ONLY="no" HELP="no" Loading Loading @@ -42,6 +43,7 @@ do INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;; --with-cmake=* ) CMAKE="${option#*=}" ;; --build-jobs=* ) BUILD_JOBS="${option#*=}" ;; --cmake-only=* ) CMAKE_ONLY="${option#*=}" ;; Loading Loading @@ -97,7 +99,8 @@ ${CMAKE} ${ROOT_DIR} \ -DINSTANTIATE_DOUBLE=${INSTANTIATE_DOUBLE} \ -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} \ -DINSTANTIATE_INT=${INSTANTIATE_INT} \ -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} \ -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS} if test $? != 0; then echo "Error: cmake exited with error code." Loading
src/TNL/Containers/VectorOperationsHost_impl.h +122 −23 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ namespace TNL { namespace Containers { static const int OpenMPVectorOperationsThreshold = 65536; // TODO: check this threshold static const int PrefetchDistance = 128; template< typename Vector > void VectorOperations< Devices::Host >::addElement( Vector& v, Loading Loading @@ -111,9 +112,48 @@ getVectorL2Norm( const Vector& v ) { typedef typename Vector :: RealType Real; typedef typename Vector :: IndexType Index; Assert( v. getSize() > 0, ); Real result( 0.0 ); const Index n = v. getSize(); #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS #ifdef __GNUC__ // We need to get the address of the first element to avoid // bounds checking in TNL::Array::operator[] const Real* V = v.getData(); #endif Real result1 = 0, result2 = 0, result3 = 0, result4 = 0; Index i = 0; const Index unroll_limit = n - n % 4; #ifdef HAVE_OPENMP #pragma omp parallel for \ if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \ reduction(+:result1,result2,result3,result4) \ lastprivate(i) #endif for( i = 0; i < unroll_limit; i += 4 ) { #ifdef __GNUC__ __builtin_prefetch(V + i + PrefetchDistance, 0, 0); #endif result1 += v[ i ] * v[ i ]; result2 += v[ i + 1 ] * v[ i + 1 ]; result3 += v[ i + 2 ] * v[ i + 2 ]; result4 += v[ i + 3 ] * v[ i + 3 ]; } while( i < n ) { result1 += v[ i ] * v[ i ]; i++; } return std::sqrt(result1 + result2 + result3 + result4); #else // OPTIMIZED_VECTOR_HOST_OPERATIONS Real result( 0.0 ); #ifdef HAVE_OPENMP #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold #endif Loading @@ -123,6 +163,7 @@ getVectorL2Norm( const Vector& v ) result += aux * aux; } return std::sqrt( result ); #endif // OPTIMIZED_VECTOR_HOST_OPERATIONS } template< typename Vector > Loading Loading @@ -360,33 +401,55 @@ typename Vector1 :: RealType VectorOperations< Devices::Host > :: getScalarProdu Assert( v1. getSize() > 0, ); Assert( v1. getSize() == v2. getSize(), ); const Index n = v1. getSize(); #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS #ifdef __GNUC__ // We need to get the address of the first element to avoid // bounds checking in TNL::Array::operator[] const Real* V1 = v1.getData(); const Real* V2 = v2.getData(); #endif Real dot1 = 0.0, dot2 = 0.0, dot3 = 0.0, dot4 = 0.0; Index i = 0; const Index unroll_limit = n - n % 4; #ifdef HAVE_OPENMP #pragma omp parallel for \ if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) \ reduction(+:dot1,dot2,dot3,dot4) \ lastprivate(i) #endif for( i = 0; i < unroll_limit; i += 4 ) { #ifdef __GNUC__ __builtin_prefetch(V1 + i + PrefetchDistance, 0, 0); __builtin_prefetch(V2 + i + PrefetchDistance, 0, 0); #endif dot1 += v1[ i ] * v2[ i ]; dot2 += v1[ i + 1 ] * v2[ i + 1 ]; dot3 += v1[ i + 2 ] * v2[ i + 2 ]; dot4 += v1[ i + 3 ] * v2[ i + 3 ]; } while( i < n ) { dot1 += v1[ i ] * v2[ i ]; i++; } return dot1 + dot2 + dot3 + dot4; #else // OPTIMIZED_VECTOR_HOST_OPERATIONS Real result( 0.0 ); const Index n = v1. getSize(); #ifdef HAVE_OPENMP #pragma omp parallel for reduction(+:result) if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold #endif for( Index i = 0; i < n; i++ ) result += v1[ i ] * v2[ i ]; /*Real result1( 0.0 ), result2( 0.0 ), result3( 0.0 ), result4( 0.0 ), result5( 0.0 ), result6( 0.0 ), result7( 0.0 ), result8( 0.0 ); Index i( 0 ); while( i + 8 < n ) { result1 += v1[ i ] * v2[ i ]; result2 += v1[ i + 1 ] * v2[ i + 1 ]; result3 += v1[ i + 2 ] * v2[ i + 2 ]; result4 += v1[ i + 3 ] * v2[ i + 3 ]; result5 += v1[ i + 4 ] * v2[ i + 4 ]; result6 += v1[ i + 5 ] * v2[ i + 5 ]; result7 += v1[ i + 6 ] * v2[ i + 6 ]; result8 += v1[ i + 7 ] * v2[ i + 7 ]; i += 8; } Real result = result1 + result2 + result3 + result4 + result5 +result6 +result7 +result8; while( i < n ) result += v1[ i ] * v2[ i++ ];*/ return result; #endif // OPTIMIZED_VECTOR_HOST_OPERATIONS } template< typename Vector1, typename Vector2 > Loading @@ -400,8 +463,43 @@ void VectorOperations< Devices::Host > :: addVector( Vector1& y, Assert( x. getSize() > 0, ); Assert( x. getSize() == y. getSize(), ); const Index n = y. getSize(); #ifdef OPTIMIZED_VECTOR_HOST_OPERATIONS #ifdef __GNUC__ // We need to get the address of the first element to avoid // bounds checking in TNL::Array::operator[] Real* Y = y.getData(); const Real* X = x.getData(); #endif Index i = 0; const Index unroll_limit = n - n % 4; #ifdef HAVE_OPENMP #pragma omp parallel for \ if( n > OpenMPVectorOperationsThreshold ) \ lastprivate(i) #endif for(i = 0; i < unroll_limit; i += 4) { #ifdef __GNUC__ __builtin_prefetch(&y[ i + PrefetchDistance ], 1, 0); __builtin_prefetch(&x[ i + PrefetchDistance ], 0, 0); #endif y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; y[ i + 1 ] = thisMultiplicator * y[ i + 1 ] + alpha * x[ i + 1 ]; y[ i + 2 ] = thisMultiplicator * y[ i + 2 ] + alpha * x[ i + 2 ]; y[ i + 3 ] = thisMultiplicator * y[ i + 3 ] + alpha * x[ i + 3 ]; } while( i < n ) { y[i] = thisMultiplicator * y[ i ] + alpha * x[ i ]; i++; } #else // OPTIMIZED_VECTOR_HOST_OPERATIONS if( thisMultiplicator == 1.0 ) #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold ) // TODO: check this threshold Loading @@ -414,6 +512,7 @@ void VectorOperations< Devices::Host > :: addVector( Vector1& y, #endif for( Index i = 0; i < n; i ++ ) y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; #endif // OPTIMIZED_VECTOR_HOST_OPERATIONS } template< typename Vector1, Loading