From c8a7469c7910f5cfa6ea42cc3a188ed0463cb5de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz> Date: Tue, 16 Jul 2019 22:40:46 +0200 Subject: [PATCH] Fixed ParallelFor and added tests This reverts commit cf34dca066ff8368c3d591e7bf74145a4ea025f9. --- src/TNL/Devices/Cuda.h | 4 +- src/TNL/Devices/Cuda_impl.h | 4 +- src/TNL/ParallelFor.h | 12 +- src/UnitTests/CMakeLists.txt | 10 ++ src/UnitTests/ParallelForTest.cpp | 1 + src/UnitTests/ParallelForTest.cu | 1 + src/UnitTests/ParallelForTest.h | 290 ++++++++++++++++++++++++++++++ 7 files changed, 312 insertions(+), 10 deletions(-) create mode 100644 src/UnitTests/ParallelForTest.cpp create mode 100644 src/UnitTests/ParallelForTest.cu create mode 100644 src/UnitTests/ParallelForTest.h diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h index 80108bf530..7831014155 100644 --- a/src/TNL/Devices/Cuda.h +++ b/src/TNL/Devices/Cuda.h @@ -75,10 +75,10 @@ class Cuda /**** * This functions helps to count number of CUDA grids depending on the - * number of the CUDA threads and maximum grid size. + * number of the CUDA blocks and maximum grid size. * It is obsolete and it will be replaced by setupThreads. */ - static inline int getNumberOfGrids( const int threads, + static inline int getNumberOfGrids( const int blocks, const int gridSize = getMaxGridSize() ); #ifdef HAVE_CUDA diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h index a40375c39c..234f45b720 100644 --- a/src/TNL/Devices/Cuda_impl.h +++ b/src/TNL/Devices/Cuda_impl.h @@ -111,10 +111,10 @@ inline int Cuda::getNumberOfBlocks( const int threads, return roundUpDivision( threads, blockSize ); } -inline int Cuda::getNumberOfGrids( const int threads, +inline int Cuda::getNumberOfGrids( const int blocks, const int gridSize ) { - return roundUpDivision( threads, gridSize ); + return roundUpDivision( blocks, gridSize ); } #ifdef HAVE_CUDA diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 950a577bb4..04af274080 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -205,7 +205,7 @@ struct ParallelFor< Devices::Cuda, Mode > dim3 gridSize; gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) ); - if( Devices::Cuda::getNumberOfGrids( end - start ) == 1 ) + if( (std::size_t) blockSize.x * gridSize.x >= (std::size_t) end - start ) ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... ); else { // decrease the grid size and align to the number of multiprocessors @@ -257,8 +257,8 @@ struct ParallelFor2D< Devices::Cuda, Mode > gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) ); dim3 gridCount; - gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX ); - gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY ); + gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x ); + gridCount.y = roundUpDivision( sizeY, blockSize.y * gridSize.y ); if( gridCount.x == 1 && gridCount.y == 1 ) ParallelFor2DKernel< false, false ><<< gridSize, blockSize >>> @@ -342,9 +342,9 @@ struct ParallelFor3D< Devices::Cuda, Mode > gridSize.z = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) ); dim3 gridCount; - gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX ); - gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY ); - gridCount.z = Devices::Cuda::getNumberOfGrids( sizeZ ); + gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x ); + gridCount.y = roundUpDivision( sizeY, blockSize.y * gridSize.y ); + gridCount.z = roundUpDivision( sizeZ, blockSize.z * gridSize.z ); if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z == 1 ) ParallelFor3DKernel< false, false, false ><<< gridSize, blockSize >>> diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt index 564a7d1347..f7d3b68b03 100644 --- a/src/UnitTests/CMakeLists.txt +++ b/src/UnitTests/CMakeLists.txt @@ -35,6 +35,15 @@ ADD_EXECUTABLE( ObjectTest ObjectTest.cpp ) TARGET_COMPILE_OPTIONS( ObjectTest PRIVATE ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( ObjectTest ${GTEST_BOTH_LIBRARIES} ) +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( ParallelForTest ParallelForTest.cu OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( ParallelForTest ${GTEST_BOTH_LIBRARIES} ) +else() + ADD_EXECUTABLE( ParallelForTest ParallelForTest.cpp ) + TARGET_COMPILE_OPTIONS( ParallelForTest PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( ParallelForTest ${GTEST_BOTH_LIBRARIES} ) +endif() + ADD_EXECUTABLE( SaveAndLoadMeshfunctionTest SaveAndLoadMeshfunctionTest.cpp ) TARGET_COMPILE_OPTIONS( SaveAndLoadMeshfunctionTest PRIVATE ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( SaveAndLoadMeshfunctionTest ${GTEST_BOTH_LIBRARIES} ) @@ -46,4 +55,5 @@ endif() ADD_TEST( FileTest ${EXECUTABLE_OUTPUT_PATH}/FileTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( StringTest ${EXECUTABLE_OUTPUT_PATH}/StringTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( ObjectTest ${EXECUTABLE_OUTPUT_PATH}/ObjectTest${CMAKE_EXECUTABLE_SUFFIX} ) +ADD_TEST( ParallelForTest ${EXECUTABLE_OUTPUT_PATH}/ParallelForTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( SaveAndLoadMeshfunctionTest ${EXECUTABLE_OUTPUT_PATH}/SaveAndLoadMeshfunctionTest${CMAKE_EXECUTABLE_SUFFIX} ) diff --git a/src/UnitTests/ParallelForTest.cpp b/src/UnitTests/ParallelForTest.cpp new file mode 100644 index 0000000000..ed0e90c0b1 --- /dev/null +++ b/src/UnitTests/ParallelForTest.cpp @@ -0,0 +1 @@ +#include "ParallelForTest.h" diff --git a/src/UnitTests/ParallelForTest.cu b/src/UnitTests/ParallelForTest.cu new file mode 100644 index 0000000000..ed0e90c0b1 --- /dev/null +++ b/src/UnitTests/ParallelForTest.cu @@ -0,0 +1 @@ +#include "ParallelForTest.h" diff --git a/src/UnitTests/ParallelForTest.h b/src/UnitTests/ParallelForTest.h new file mode 100644 index 0000000000..95455286e7 --- /dev/null +++ b/src/UnitTests/ParallelForTest.h @@ -0,0 +1,290 @@ +/*************************************************************************** + ParallelForTest.h - description + ------------------- + begin : Jul 16, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include <TNL/Devices/Host.h> +#include <TNL/Devices/Cuda.h> +#include <TNL/Containers/Array.h> +#include <TNL/ParallelFor.h> + +#ifdef HAVE_GTEST +#include <gtest/gtest.h> +#endif + +using namespace TNL; + +#ifdef HAVE_GTEST +TEST( ParallelForTest, 1D_host ) +{ + using Array = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + Array expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel = [=] (int i) mutable + { + view[i] = i; + }; + ParallelFor< Devices::Host >::exec( 0, size, kernel ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 2D_host ) +{ + using Array = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + Array expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] (int i, int j) mutable + { + view[i] = i; + }; + ParallelFor2D< Devices::Host >::exec( 0, 0, size, 1, kernel1 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] (int i, int j) mutable + { + view[j] = j; + }; + ParallelFor2D< Devices::Host >::exec( 0, 0, 1, size, kernel2 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 3D_host ) +{ + using Array = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + Array expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] (int i, int j, int k) mutable + { + view[i] = i; + }; + ParallelFor3D< Devices::Host >::exec( 0, 0, 0, size, 1, 1, kernel1 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] (int i, int j, int k) mutable + { + view[j] = j; + }; + ParallelFor3D< Devices::Host >::exec( 0, 0, 0, 1, size, 1, kernel2 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel3 = [=] (int i, int j, int k) mutable + { + view[k] = k; + }; + ParallelFor3D< Devices::Host >::exec( 0, 0, 0, 1, 1, size, kernel3 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +#ifdef HAVE_CUDA +// nvcc does not allow __cuda_callable__ lambdas inside private regions +void test_1D_cuda() +{ + using Array = Containers::Array< int, Devices::Cuda >; + using ArrayHost = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + ArrayHost expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel = [=] __cuda_callable__ (int i) mutable + { + view[i] = i; + }; + ParallelFor< Devices::Cuda >::exec( 0, size, kernel ); + + ArrayHost ah; + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 1D_cuda ) +{ + test_1D_cuda(); +} + +// nvcc does not allow __cuda_callable__ lambdas inside private regions +void test_2D_cuda() +{ + using Array = Containers::Array< int, Devices::Cuda >; + using ArrayHost = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + ArrayHost expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] __cuda_callable__ (int i, int j) mutable + { + view[i] = i; + }; + ParallelFor2D< Devices::Cuda >::exec( 0, 0, size, 1, kernel1 ); + + ArrayHost ah; + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] __cuda_callable__ (int i, int j) mutable + { + view[j] = j; + }; + ParallelFor2D< Devices::Cuda >::exec( 0, 0, 1, size, kernel2 ); + + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 2D_cuda ) +{ + test_2D_cuda(); +} + +// nvcc does not allow __cuda_callable__ lambdas inside private regions +void test_3D_cuda() +{ + using Array = Containers::Array< int, Devices::Cuda >; + using ArrayHost = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + ArrayHost expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] __cuda_callable__ (int i, int j, int k) mutable + { + view[i] = i; + }; + ParallelFor3D< Devices::Cuda >::exec( 0, 0, 0, size, 1, 1, kernel1 ); + + ArrayHost ah; + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] __cuda_callable__ (int i, int j, int k) mutable + { + view[j] = j; + }; + ParallelFor3D< Devices::Cuda >::exec( 0, 0, 0, 1, size, 1, kernel2 ); + + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel3 = [=] __cuda_callable__ (int i, int j, int k) mutable + { + view[k] = k; + }; + ParallelFor3D< Devices::Cuda >::exec( 0, 0, 0, 1, 1, size, kernel3 ); + + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 3D_cuda ) +{ + test_3D_cuda(); +} +#endif +#endif + +#include "main.h" -- GitLab