Optimization of ParallelFor on CPU. (746fa701) · Commits · TNL / tnl-dev

src/TNL/ParallelFor.h

+50 −11

Original line number	Diff line number	Diff line
		@@ -37,10 +37,21 @@ struct ParallelFor
		// Calls f( i, args... ) once for every index i in [start, end).
		//
		// With HAVE_OPENMP defined, the loop is run under '#pragma omp parallel for'
		// when TNL::Devices::Host::isOMPEnabled() returns true and the range holds
		// more than 512 indices; in every other case the loop runs sequentially.
		// Without HAVE_OPENMP the loop is always sequential.
		static void exec( Index start, Index end, Function f, FunctionArgs... args )
		{
		#ifdef HAVE_OPENMP
		   // Benchmarks show that this is significantly faster compared
		   // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
		   if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
		   {
		#pragma omp parallel for
		      for( Index i = start; i < end; i++ )
		         f( i, args... );
		   }
		   else
		   {
		      // Short range or OpenMP disabled at run time: plain sequential loop.
		      for( Index i = start; i < end; i++ )
		         f( i, args... );
		   }
		#else
		   for( Index i = start; i < end; i++ )
		      f( i, args... );
		#endif
		}
		};

		@@ -53,11 +64,24 @@ struct ParallelFor2D
		// Calls f( i, j, args... ) for every pair (i, j) with i in [startX, endX)
		// and j in [startY, endY); j is the inner (fastest-varying) loop index.
		//
		// With HAVE_OPENMP defined, the outer i-loop is run under
		// '#pragma omp parallel for' when TNL::Devices::Host::isOMPEnabled() returns
		// true; otherwise both loops run sequentially. Without HAVE_OPENMP the loops
		// are always sequential.
		static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
		{
		#ifdef HAVE_OPENMP
		   // Benchmarks show that this is significantly faster compared
		   // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
		   if( TNL::Devices::Host::isOMPEnabled() )
		   {
		#pragma omp parallel for
		      for( Index i = startX; i < endX; i++ )
		         for( Index j = startY; j < endY; j++ )
		            f( i, j, args... );
		   }
		   else
		   {
		      // OpenMP disabled at run time: plain sequential loops.
		      for( Index i = startX; i < endX; i++ )
		         for( Index j = startY; j < endY; j++ )
		            f( i, j, args... );
		   }
		#else
		   for( Index i = startX; i < endX; i++ )
		      for( Index j = startY; j < endY; j++ )
		         f( i, j, args... );
		#endif
		}
		};

		@@ -70,12 +94,27 @@ struct ParallelFor3D
		// Calls f( i, j, k, args... ) for every triple (i, j, k) with
		// i in [startX, endX), j in [startY, endY) and k in [startZ, endZ);
		// k is the innermost (fastest-varying) loop index.
		//
		// With HAVE_OPENMP defined, the two outer loops are run under
		// '#pragma omp parallel for collapse(2)' when
		// TNL::Devices::Host::isOMPEnabled() returns true; otherwise all three loops
		// run sequentially. Without HAVE_OPENMP the loops are always sequential.
		static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
		{
		#ifdef HAVE_OPENMP
		   // Benchmarks show that this is significantly faster compared
		   // to '#pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() )'
		   if( TNL::Devices::Host::isOMPEnabled() )
		   {
		#pragma omp parallel for collapse(2)
		      for( Index i = startX; i < endX; i++ )
		         for( Index j = startY; j < endY; j++ )
		            for( Index k = startZ; k < endZ; k++ )
		               f( i, j, k, args... );
		   }
		   else
		   {
		      // OpenMP disabled at run time: plain sequential loops.
		      for( Index i = startX; i < endX; i++ )
		         for( Index j = startY; j < endY; j++ )
		            for( Index k = startZ; k < endZ; k++ )
		               f( i, j, k, args... );
		   }
		#else
		   for( Index i = startX; i < endX; i++ )
		      for( Index j = startY; j < endY; j++ )
		         for( Index k = startZ; k < endZ; k++ )
		            f( i, j, k, args... );
		#endif
		}
		};