Added asynchronous mode to ParallelFor. (cd43ce96) · Commits · TNL / tnl-dev

src/TNL/ParallelFor.h

+30 −16

Original line number	Diff line number	Diff line
		@@ -15,7 +15,7 @@
		#include <TNL/Devices/CudaDeviceInfo.h>
		#include <TNL/Math.h>

		/*
		/****
		* The implementation of ParallelFor is not meant to provide maximum performance
		* at every cost, but maximum flexibility for operating with data stored on the
		* device.
		@@ -28,7 +28,10 @@

		namespace TNL {

		template< typename Device = Devices::Host >
		/****
		 * Execution mode selector used as the second template parameter of
		 * ParallelFor, ParallelFor2D and ParallelFor3D (defaults to SynchronousMode).
		 *
		 * In the Devices::Cuda specializations, SynchronousMode makes the call run
		 * cudaDeviceSynchronize() followed by TNL_CHECK_CUDA_DEVICE after the kernel
		 * launch; with AsynchronousMode that synchronization and check are skipped.
		 */
		enum ParallelForMode { SynchronousMode, AsynchronousMode };

		template< typename Device = Devices::Host,
		ParallelForMode Mode = SynchronousMode >
		struct ParallelFor
		{
		template< typename Index,
		@@ -55,7 +58,8 @@ struct ParallelFor
		}
		};

		template< typename Device = Devices::Host >
		template< typename Device = Devices::Host,
		ParallelForMode Mode = SynchronousMode >
		struct ParallelFor2D
		{
		template< typename Index,
		@@ -86,7 +90,8 @@ struct ParallelFor2D
		}
		};

		template< typename Device = Devices::Host >
		template< typename Device = Devices::Host,
		ParallelForMode Mode = SynchronousMode >
		struct ParallelFor3D
		{
		template< typename Index,
		@@ -185,8 +190,8 @@ ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index
		}
		#endif

		template<>
		struct ParallelFor< Devices::Cuda >
		template< ParallelForMode Mode >
		struct ParallelFor< Devices::Cuda, Mode >
		{
		template< typename Index,
		typename Function,
		@@ -208,17 +213,20 @@ struct ParallelFor< Devices::Cuda >
		ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
		}

		if( Mode == SynchronousMode )
		{
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		}
		#else
		throw Exceptions::CudaSupportMissing();
		#endif
		}
		};

		template<>
		struct ParallelFor2D< Devices::Cuda >
		template< ParallelForMode Mode >
		struct ParallelFor2D< Devices::Cuda, Mode >
		{
		template< typename Index,
		typename Function,
		@@ -264,17 +272,20 @@ struct ParallelFor2D< Devices::Cuda >
		ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>>
		( startX, startY, endX, endY, f, args... );

		if( Mode == SynchronousMode )
		{
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		}
		#else
		throw Exceptions::CudaSupportMissing();
		#endif
		}
		};

		template<>
		struct ParallelFor3D< Devices::Cuda >
		template< ParallelForMode Mode >
		struct ParallelFor3D< Devices::Cuda, Mode >
		{
		template< typename Index,
		typename Function,
		@@ -343,9 +354,12 @@ struct ParallelFor3D< Devices::Cuda >
		ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>>
		( startX, startY, startZ, endX, endY, endZ, f, args... );

		if( Mode == SynchronousMode )
		{
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		}
		#else
		throw Exceptions::CudaSupportMissing();
		#endif