diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt index 8afdc50cc9771126ecbebe1f64c57d3aa1fd1807..c200642f526cb84e06601b1c0a57c823cc885102 100644 --- a/Documentation/Examples/Algorithms/CMakeLists.txt +++ b/Documentation/Examples/Algorithms/CMakeLists.txt @@ -1,56 +1,34 @@ ADD_SUBDIRECTORY( Segments ) -IF( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu) - ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) - - CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu) - ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) - - CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu) - ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) - - CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu) - ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) - - CUDA_ADD_EXECUTABLE(reduceArrayExampleCuda reduceArrayExample.cu) - ADD_CUSTOM_COMMAND( COMMAND reduceArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out ) - - CUDA_ADD_EXECUTABLE(reduceWithArgumentArrayExampleCuda reduceWithArgumentArrayExample.cu) - ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out ) -ELSE() - ADD_EXECUTABLE( SortingExample SortingExample.cpp) - ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) - - ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp) - 
ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) - - ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp) - ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) - - ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp) - ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) - - ADD_EXECUTABLE(reduceArrayExample reduceArrayExample.cpp) - ADD_CUSTOM_COMMAND( COMMAND reduceArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out ) - - ADD_EXECUTABLE(reduceWithArgumentArrayExample reduceWithArgumentArrayExample.cpp) - ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out ) -ENDIF() - -ADD_EXECUTABLE(staticForExample staticForExample.cpp) -ADD_CUSTOM_COMMAND( COMMAND staticForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/staticForExample.out OUTPUT staticForExample.out ) - -ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp) -ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out ) - -ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS - SortingExample.out - SortingExample2.out - SortingExample3.out - ParallelForExample.out - reduceArrayExample.out - reduceWithArgumentArrayExample.out - unrolledForExample.out - staticForExample.out +set( COMMON_EXAMPLES + SortingExample + SortingExample2 + SortingExample3 + ParallelForExample + SequentialForExample ) + +set( HOST_EXAMPLES + staticForExample + unrolledForExample +) +if( BUILD_CUDA ) + foreach( target IN ITEMS ${COMMON_EXAMPLES} ) + cuda_add_executable( ${target}-cuda ${target}.cu 
OPTIONS ) + add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out ) + set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out ) + endforeach() +else() + # Both lists are passed unquoted so that foreach visits every example name separately. + foreach( target IN ITEMS ${COMMON_EXAMPLES} ${HOST_EXAMPLES} ) + add_executable( ${target} ${target}.cpp ) + add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out ) + set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out ) + endforeach() +endif() + +IF( BUILD_CUDA ) + ADD_CUSTOM_TARGET( RunAlgorithmsExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} ) +ELSE() + ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS ${HOST_OUTPUTS} ) +ENDIF() \ No newline at end of file diff --git a/Documentation/Examples/Algorithms/SequentialForExample.cpp b/Documentation/Examples/Algorithms/SequentialForExample.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d127a33a973f6c7ab9482680dd70bd6c321b7999 --- /dev/null +++ b/Documentation/Examples/Algorithms/SequentialForExample.cpp @@ -0,0 +1,37 @@ +#include <iostream> +#include <cstdlib> +#include <TNL/Containers/Vector.h> +#include <TNL/Algorithms/ParallelFor.h> +#include <TNL/Algorithms/SequentialFor.h> + +using namespace TNL; +using namespace TNL::Containers; + +template< typename Device > +void printVector() +{ + const int size( 36 ); + TNL::Containers::Vector< float, Device > v( size, 1.0 ); + auto view = v.getView(); + auto print = [=] __cuda_callable__ ( int i ) mutable { + printf( "v[ %d ] = %f \n", i, view[ i ] ); // we use printf because of compatibility with GPU kernels + }; + std::cout << "Printing vector using parallel for: " << std::endl; + Algorithms::ParallelFor< Device >::exec( 0, v.getSize(), print ); + + std::cout << "Printing vector using sequential for: " << std::endl; + Algorithms::SequentialFor< Device >::exec( 0, v.getSize(), print ); +} + +int main( int argc, char* argv[] ) +{ + std::cout << "Example on the host:" << std::endl; + 
printVector< TNL::Devices::Host >(); + +#ifdef HAVE_CUDA + std::cout << "Example on CUDA GPU:" << std::endl; + printVector< TNL::Devices::Cuda >(); +#endif + return EXIT_SUCCESS; +} + diff --git a/Documentation/Examples/Algorithms/SequentialForExample.cu b/Documentation/Examples/Algorithms/SequentialForExample.cu new file mode 120000 index 0000000000000000000000000000000000000000..ac78b379b6e3ed91150ba91d3c7c465a41f70501 --- /dev/null +++ b/Documentation/Examples/Algorithms/SequentialForExample.cu @@ -0,0 +1 @@ +SequentialForExample.cpp \ No newline at end of file diff --git a/src/TNL/Algorithms/SequentialFor.h b/src/TNL/Algorithms/SequentialFor.h new file mode 100644 index 0000000000000000000000000000000000000000..ea783ca33dbfb875c15b55be12fe3c0aa2480c0e --- /dev/null +++ b/src/TNL/Algorithms/SequentialFor.h @@ -0,0 +1,54 @@ +/*************************************************************************** + SequentialFor.h - description + ------------------- + begin : Apr 5, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Algorithms/ParallelFor.h> + + +namespace TNL { + namespace Algorithms { + +/** + * \brief Wrapper to ParallelFor which makes it run sequentially. + * + * It is helpful for debugging or just for sequential for-loops on GPUs. + */ +template< typename Device = Devices::Sequential > +struct SequentialFor +{ + /** + * \brief Static method for execution of the loop. + * + * \tparam Index defines the type of indexes over which the loop iterates. + * \tparam Function is the type of function to be called in each iteration. + * + * \param start the for-loop iterates over index interval [start, end). + * \param end the for-loop iterates over index interval [start, end). 
+ * \param f is the function to be called in each iteration + * + * \par Example + * \include Algorithms/SequentialForExample.cpp + * \par Output + * \include SequentialForExample.out + * + */ + template< typename Index, + typename Function > + static void exec( Index start, Index end, Function f ) + { + for( Index i = start; i < end; i++ ) + ParallelFor< Device >::exec( i, i + 1, f ); + } +}; + + + } // namespace Algorithms +} // namespace TNL \ No newline at end of file