Loading Documentation/Tutorials/CMakeLists.txt +2 −1 Original line number Diff line number Diff line add_subdirectory( Arrays ) add_subdirectory( Vectors ) add_subdirectory( ReductionAndScan ) add_subdirectory( ForLoops ) Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp 0 → 100644 +20 KiB File added.No diff preview for this file type. View file Documentation/Tutorials/ForLoops/CMakeLists.txt 0 → 100644 +9 −0 Original line number Diff line number Diff line IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu ) ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) ENDIF() IF( BUILD_CUDA ) ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS ParallelForExample.out ) ENDIF() Documentation/Tutorials/ForLoops/ParallelForExample.cpp 0 → 100644 +58 −0 Original line number Diff line number Diff line #include <iostream> #include <cstdlib> #include <TNL/Containers/Vector.h> using namespace TNL; using namespace TNL::Containers; using namespace TNL::Containers::Algorithms; template< typename Device > void vectorSum( const Vector< double, Device >& v1, const Vector< double, Device >& v2, const double& c, Vector< double, Device >& result ) { /**** * Get vectors view which can be captured by lambda. */ auto v1_view = v1.getConstView(); auto v2_view = v2.getConstView(); auto result_view = result.getView(); /**** * The sum function. */ auto sum = [=] __cuda_callable__ ( int i, const double c ) mutable { result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; }; ParallelFor< Device >::exec( 0, v1.getSize(), sum, c ); } int main( int argc, char* argv[] ) { /*** * Firstly, test the vectors sum on CPU. 
*/ Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 ); host_v1 = 1.0; host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( host_v1, host_v2, 2.0, host_result ); std::cout << "host_v1 = " << host_v1 << std::endl; std::cout << "host_v2 = " << host_v2 << std::endl; std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl; /*** * And then also on GPU. */ #ifdef HAVE_CUDA Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 ); cuda_v1 = 1.0; cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result ); std::cout << "cuda_v1 = " << cuda_v1 << std::endl; std::cout << "cuda_v2 = " << cuda_v2 << std::endl; std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl; #endif return EXIT_SUCCESS; } Documentation/Tutorials/ForLoops/ParallelForExample.cu 0 → 120000 +1 −0 Original line number Diff line number Diff line ParallelForExample.cpp No newline at end of file Loading
# Documentation/Tutorials/CMakeLists.txt
# Each tutorial lives in its own subdirectory with a CMakeLists.txt that
# builds the example executables and generates output snippets for the docs.
add_subdirectory( Arrays )
add_subdirectory( Vectors )
add_subdirectory( ReductionAndScan )
add_subdirectory( ForLoops )
Documentation/Tutorials/ForLoops/.tutorial_04_ForLoops.md.swp 0 → 100644 +20 KiB File added. NOTE(review): this is a Vim swap file committed by accident — it should be dropped from the merge request and `*.swp` added to .gitignore. No diff preview for this file type. View file
# Documentation/Tutorials/ForLoops/CMakeLists.txt
# Builds the ParallelFor tutorial example and captures its stdout as a
# documentation snippet. The example is compiled only when CUDA is enabled,
# because the sources are compiled through the ParallelForExample.cu symlink.
IF( BUILD_CUDA )
   CUDA_ADD_EXECUTABLE( ParallelForExample ParallelForExample.cu )
   # DEPENDS on the executable target guarantees the snippet is generated
   # after the example is built and regenerated whenever it is rebuilt.
   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out
                       DEPENDS ParallelForExample
                       OUTPUT ParallelForExample.out )
   # The redundant second IF( BUILD_CUDA ) guard was merged into this one.
   ADD_CUSTOM_TARGET( ForLoops-cuda ALL DEPENDS ParallelForExample.out )
ENDIF()
Documentation/Tutorials/ForLoops/ParallelForExample.cpp 0 → 100644 +58 −0 Original line number Diff line number Diff line #include <iostream> #include <cstdlib> #include <TNL/Containers/Vector.h> using namespace TNL; using namespace TNL::Containers; using namespace TNL::Containers::Algorithms; template< typename Device > void vectorSum( const Vector< double, Device >& v1, const Vector< double, Device >& v2, const double& c, Vector< double, Device >& result ) { /**** * Get vectors view which can be captured by lambda. */ auto v1_view = v1.getConstView(); auto v2_view = v2.getConstView(); auto result_view = result.getView(); /**** * The sum function. */ auto sum = [=] __cuda_callable__ ( int i, const double c ) mutable { result_view[ i ] = v1_view[ i ] + v2_view[ i ] + c; }; ParallelFor< Device >::exec( 0, v1.getSize(), sum, c ); } int main( int argc, char* argv[] ) { /*** * Firstly, test the vectors sum on CPU. */ Vector< double, Devices::Host > host_v1( 10 ), host_v2( 10 ), host_result( 10 ); host_v1 = 1.0; host_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( host_v1, host_v2, 2.0, host_result ); std::cout << "host_v1 = " << host_v1 << std::endl; std::cout << "host_v2 = " << host_v2 << std::endl; std::cout << "The sum of the vectors on CPU is " << host_result << "." << std::endl; /*** * And then also on GPU. */ #ifdef HAVE_CUDA Vector< double, Devices::Cuda > cuda_v1( 10 ), cuda_v2( 10 ), cuda_result( 10 ); cuda_v1 = 1.0; cuda_v2.evaluate( []__cuda_callable__ ( int i )->double { return i; } ); vectorSum( cuda_v1, cuda_v2, 2.0, cuda_result ); std::cout << "cuda_v1 = " << cuda_v1 << std::endl; std::cout << "cuda_v2 = " << cuda_v2 << std::endl; std::cout << "The sum of the vectors on GPU is " << cuda_result << "." << std::endl; #endif return EXIT_SUCCESS; }
Documentation/Tutorials/ForLoops/ParallelForExample.cu 0 → 120000 +1 −0 Original line number Diff line number Diff line ParallelForExample.cpp No newline at end of file