Loading Documentation/Tutorials/CMakeLists.txt +2 −1 Original line number Diff line number Diff line add_subdirectory( Arrays ) add_subdirectory( Vectors ) add_subdirectory( ReductionAndScan ) add_subdirectory( ForLoops ) Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp 0 → 100644 +20 KiB File added.No diff preview for this file type. View file Documentation/Tutorials/ForLoops/CMakeLists.txt 0 → 100644 +9 −0 Original line number Diff line number Diff line IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu ) ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) ENDIF() IF( BUILD_CUDA ) ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS ParallelForExample.out ) ENDIF() Documentation/Tutorials/ForLoops/ParallelForExample.cpp 0 → 100644 +58 −0 Original line number Diff line number Diff line #include <iostream> #include <cstdlib> #include <TNL/Containers/Vector.h> using namespace TNL; using namespace TNL::Containers; using namespace TNL::Containers::Algorithms; template< typename Device > void vectorSum( const Vector< double, Device >& v1, const Vector< double, Device >& v2, const double& c, Vector< double, Device >& result ) { /**** * Get vectors view which can be captured by lambda. */ auto v1_view = v1.getConstView(); auto v2_view = v2.getConstView(); auto result_view = result.getView(); /**** * The sum function. */ auto sum = [=] __cuda_callable__ ( int i, const double c ) mutable { result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; }; ParallelFor< Device >::exec( 0, v1.getSize(), sum, c ); } int main( int argc, char* argv[] ) { /*** * Firstly, test the vectors sum on CPU. 
*/ Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 ); host_v1 = 1.0; host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( host_v1, host_v2, 2.0, host_result ); std::cout << "host_v1 = " << host_v1 << std::endl; std::cout << "host_v2 = " << host_v2 << std::endl; std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl; /*** * And then also on GPU. */ #ifdef HAVE_CUDA Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 ); cuda_v1 = 1.0; cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result ); std::cout << "cuda_v1 = " << cuda_v1 << std::endl; std::cout << "cuda_v2 = " << cuda_v2 << std::endl; std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl; #endif return EXIT_SUCCESS; } Documentation/Tutorials/ForLoops/ParallelForExample.cu 0 → 120000 +1 −0 Original line number Diff line number Diff line ParallelForExample.cpp No newline at end of file Loading
# Documentation/Tutorials/CMakeLists.txt
# Each tutorial lives in its own subdirectory with a CMakeLists.txt that
# builds the example executables and generates output snippets for the docs.
add_subdirectory( Arrays )
add_subdirectory( Vectors )
add_subdirectory( ReductionAndScan )
add_subdirectory( ForLoops )
Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp 0 → 100644 +20 KiB File added. NOTE(review): this is a Vim swap file committed by accident — it should be dropped from the merge request and `*.swp` added to .gitignore. No diff preview for this file type. View file
# Documentation/Tutorials/ForLoops/CMakeLists.txt
# Builds the ParallelFor tutorial example and captures its stdout as a
# documentation snippet. The example is compiled only when CUDA is enabled,
# because the sources are compiled through the ParallelForExample.cu symlink.
IF( BUILD_CUDA )
   CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu )
   # DEPENDS on the executable target guarantees the snippet is generated
   # after the example is built and regenerated whenever it is rebuilt.
   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out
                       DEPENDS ParallelForExample
                       OUTPUT ParallelForExample.out )
   # The redundant second IF( BUILD_CUDA ) guard was merged into this one.
   ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS ParallelForExample.out )
ENDIF()
Documentation/Tutorials/ForLoops/ParallelForExample.cpp 0 → 100644 +58 −0 Original line number Diff line number Diff line #include <iostream> #include <cstdlib> #include <TNL/Containers/Vector.h> using namespace TNL; using namespace TNL::Containers; using namespace TNL::Containers::Algorithms; template< typename Device > void vectorSum( const Vector< double, Device >& v1, const Vector< double, Device >& v2, const double& c, Vector< double, Device >& result ) { /**** * Get vectors view which can be captured by lambda. */ auto v1_view = v1.getConstView(); auto v2_view = v2.getConstView(); auto result_view = result.getView(); /**** * The sum function. */ auto sum = [=] __cuda_callable__ ( int i, const double c ) mutable { result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; }; ParallelFor< Device >::exec( 0, v1.getSize(), sum, c ); } int main( int argc, char* argv[] ) { /*** * Firstly, test the vectors sum on CPU. */ Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 ); host_v1 = 1.0; host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( host_v1, host_v2, 2.0, host_result ); std::cout << "host_v1 = " << host_v1 << std::endl; std::cout << "host_v2 = " << host_v2 << std::endl; std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl; /*** * And then also on GPU. */ #ifdef HAVE_CUDA Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 ); cuda_v1 = 1.0; cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result ); std::cout << "cuda_v1 = " << cuda_v1 << std::endl; std::cout << "cuda_v2 = " << cuda_v2 << std::endl; std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl; #endif return EXIT_SUCCESS; }
Documentation/Tutorials/ForLoops/ParallelForExample.cu 0 → 120000 +1 −0 Original line number Diff line number Diff line ParallelForExample.cpp No newline at end of file