Loading Documentation/Examples/Algorithms/CMakeLists.txt +30 −51 Original line number Diff line number Diff line ADD_SUBDIRECTORY( Segments ) IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu) ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu) ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu) ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu) ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) set( COMMON_EXAMPLES SortingExample SortingExample2 SortingExample3 ParallelForExample SequentialForExample unrolledForExample.out staticForExample.out ) CUDA_ADD_EXECUTABLE(reduceArrayExampleCuda reduceArrayExample.cu) ADD_CUSTOM_COMMAND( COMMAND reduceArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out ) set( HOST_EXAMPLES staticForExample unrolledForExample ) if( BUILD_CUDA ) foreach( target IN ITEMS ${COMMON_EXAMPLES} ) cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS ) add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out ) set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out ) endforeach() else() foreach( target IN ITEMS "${COMMON_EXAMPLES} ${HOST_EXAMPLES}") add_executable( ${target} ${target}.cpp ) add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out ) set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out ) 
endforeach() endif() CUDA_ADD_EXECUTABLE(reduceWithArgumentArrayExampleCuda reduceWithArgumentArrayExample.cu) ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out ) IF( BUILD_CUDA ) ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} ) ELSE() ADD_EXECUTABLE( SortingExample SortingExample.cpp) ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp) ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp) ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp) ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) ADD_EXECUTABLE(reduceArrayExample reduceArrayExample.cpp) ADD_CUSTOM_COMMAND( COMMAND reduceArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out ) ADD_EXECUTABLE(reduceWithArgumentArrayExample reduceWithArgumentArrayExample.cpp) ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out ) ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS ${HOST_OUTPUTS} ) ENDIF() No newline at end of file ADD_EXECUTABLE(staticForExample staticForExample.cpp) ADD_CUSTOM_COMMAND( COMMAND staticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/staticForExample.out OUTPUT staticForExample.out ) ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp) 
ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out ) ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS SortingExample.out SortingExample2.out SortingExample3.out ParallelForExample.out reduceArrayExample.out reduceWithArgumentArrayExample.out unrolledForExample.out staticForExample.out ) Documentation/Examples/Algorithms/SequentialForExample.cpp 0 → 100644 +37 −0 Original line number Diff line number Diff line #include <iostream> #include <cstdlib> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/SequentialFor.h> using namespace TNL; using namespace TNL::Containers; template< typename Device > void printVector() { const int size( 36 ); TNL::Containers::Vector< float, Device > v( size, 1.0 ); auto view = v.getView(); auto print = [=] __cuda_callable__ ( int i ) mutable { printf( "v[ %d ] = %f \n", i, view[ i ] ); // we use printf because of compatibility with GPU kernels }; std::cout << "Printing vector using parallel for: " << std::endl; Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), print ); std::cout << "Printing vector using sequential for: " << std::endl; Algorithms::SequentialFor< Device >::exec( 0, v.getSize(), print ); } int main( int argc, char* argv[] ) { std::cout << "Example on the host:" << std::endl; printVector< TNL::Devices::Host >(); #ifdef HAVE_CUDA std::cout << "Example on CUDA GPU:" << std::endl; printVector< TNL::Devices::Cuda >(); #endif return EXIT_SUCCESS; } Documentation/Examples/Algorithms/SequentialForExample.cu 0 → 120000 +1 −0 Original line number Diff line number Diff line SequentialForExample.cpp No newline at end of file src/TNL/Algorithms/SequentialFor.h 0 → 100644 +54 −0 Original line number Diff line number Diff line /*************************************************************************** SequentialFor.h - description ------------------- begin : Apr 5, 2021 copyright : (C) 
2021 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Algorithms/ParallelFor.h> namespace TNL { namespace Algorithms { /** * \brief Wrapper to ParallelFor which makes it run sequentially. * * It is helpfull for debuging or just sequential for loops on GPUs. */ template< typename Device = Devices::Sequential > struct SequentialFor { /** * \brief Static method for execution of the loop. * * \tparam Index defines the type of indexes over which the loop iterates. * \tparam Function is the type of function to be called in each iteration. * * \param start the for-loop iterates over index interval [start, end). * \param end the for-loop iterates over index interval [start, end). * \param f is the function to be called in each iteration * * \par Example * \include Algorithms/SequentialForExample.cpp * \par Output * \include SequentialForExample.out * */ template< typename Index, typename Function > static void exec( Index start, Index end, Function f ) { for( Index i = start; i < end; i++ ) ParallelFor< Device >::exec( i, i + 1, f ); } }; } // namespace Algorithms } // namespace TNL No newline at end of file Loading
# Documentation/Examples/Algorithms/CMakeLists.txt
#
# Builds the Algorithms documentation examples and registers custom commands
# that run each example and redirect its standard output to
# ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/<example>.out.

add_subdirectory( Segments )

# Examples compiled from <name>.cu when BUILD_CUDA is set and from <name>.cpp
# otherwise. Entries are bare example names: the loops below append the source
# extension and the ".out" suffix themselves, so no entry may carry ".out".
set( COMMON_EXAMPLES
   SortingExample
   SortingExample2
   SortingExample3
   ParallelForExample
   SequentialForExample
)

# Examples that are only ever compiled from <name>.cpp.
# NOTE(review): these are built only in the non-CUDA configuration below;
# confirm whether they should be built in CUDA builds as well.
set( HOST_EXAMPLES
   staticForExample
   unrolledForExample
)

if( BUILD_CUDA )
   foreach( target IN LISTS COMMON_EXAMPLES )
      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out
                          OUTPUT ${target}.out )
      list( APPEND CUDA_OUTPUTS ${target}.out )
   endforeach()

   add_custom_target( RunAlgorithmsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
else()
   # IN LISTS iterates over the elements of both lists. A single quoted
   # "${COMMON_EXAMPLES} ${HOST_EXAMPLES}" argument would be one item only.
   foreach( target IN LISTS COMMON_EXAMPLES HOST_EXAMPLES )
      add_executable( ${target} ${target}.cpp )
      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out
                          OUTPUT ${target}.out )
      list( APPEND HOST_OUTPUTS ${target}.out )
   endforeach()

   add_custom_target( RunAlgorithmsExamples ALL DEPENDS ${HOST_OUTPUTS} )
endif()
Documentation/Examples/Algorithms/SequentialForExample.cpp 0 → 100644 +37 −0 Original line number Diff line number Diff line #include <iostream> #include <cstdlib> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/SequentialFor.h> using namespace TNL; using namespace TNL::Containers; template< typename Device > void printVector() { const int size( 36 ); TNL::Containers::Vector< float, Device > v( size, 1.0 ); auto view = v.getView(); auto print = [=] __cuda_callable__ ( int i ) mutable { printf( "v[ %d ] = %f \n", i, view[ i ] ); // we use printf because of compatibility with GPU kernels }; std::cout << "Printing vector using parallel for: " << std::endl; Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), print ); std::cout << "Printing vector using sequential for: " << std::endl; Algorithms::SequentialFor< Device >::exec( 0, v.getSize(), print ); } int main( int argc, char* argv[] ) { std::cout << "Example on the host:" << std::endl; printVector< TNL::Devices::Host >(); #ifdef HAVE_CUDA std::cout << "Example on CUDA GPU:" << std::endl; printVector< TNL::Devices::Cuda >(); #endif return EXIT_SUCCESS; }
Documentation/Examples/Algorithms/SequentialForExample.cu 0 → 120000 +1 −0 Original line number Diff line number Diff line SequentialForExample.cpp No newline at end of file
src/TNL/Algorithms/SequentialFor.h 0 → 100644 +54 −0 Original line number Diff line number Diff line /*************************************************************************** SequentialFor.h - description ------------------- begin : Apr 5, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Algorithms/ParallelFor.h> namespace TNL { namespace Algorithms { /** * \brief Wrapper to ParallelFor which makes it run sequentially. * * It is helpfull for debuging or just sequential for loops on GPUs. */ template< typename Device = Devices::Sequential > struct SequentialFor { /** * \brief Static method for execution of the loop. * * \tparam Index defines the type of indexes over which the loop iterates. * \tparam Function is the type of function to be called in each iteration. * * \param start the for-loop iterates over index interval [start, end). * \param end the for-loop iterates over index interval [start, end). * \param f is the function to be called in each iteration * * \par Example * \include Algorithms/SequentialForExample.cpp * \par Output * \include SequentialForExample.out * */ template< typename Index, typename Function > static void exec( Index start, Index end, Function f ) { for( Index i = start; i < end; i++ ) ParallelFor< Device >::exec( i, i + 1, f ); } }; } // namespace Algorithms } // namespace TNL No newline at end of file