diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h index 80108bf53074429493d6a7cb85441b3561360db7..7831014155e9a730c1be101c47cb2602cd8d3179 100644 --- a/src/TNL/Devices/Cuda.h +++ b/src/TNL/Devices/Cuda.h @@ -75,10 +75,10 @@ class Cuda /**** * This functions helps to count number of CUDA grids depending on the - * number of the CUDA threads and maximum grid size. + * number of the CUDA blocks and maximum grid size. * It is obsolete and it will be replaced by setupThreads. */ - static inline int getNumberOfGrids( const int threads, + static inline int getNumberOfGrids( const int blocks, const int gridSize = getMaxGridSize() ); #ifdef HAVE_CUDA diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h index a40375c39c85bcfb2e99decb90bdfe7f56e20ba0..234f45b720fcfa9502b6f4c5c14debfb48ecca64 100644 --- a/src/TNL/Devices/Cuda_impl.h +++ b/src/TNL/Devices/Cuda_impl.h @@ -111,10 +111,10 @@ inline int Cuda::getNumberOfBlocks( const int threads, return roundUpDivision( threads, blockSize ); } -inline int Cuda::getNumberOfGrids( const int threads, +inline int Cuda::getNumberOfGrids( const int blocks, const int gridSize ) { - return roundUpDivision( threads, gridSize ); + return roundUpDivision( blocks, gridSize ); } #ifdef HAVE_CUDA diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 950a577bb42434b1cdd33579b8d267f593092bfd..04af2740807b9139ca0b8452b9c1b7bc52f5a8c8 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -205,7 +205,7 @@ struct ParallelFor< Devices::Cuda, Mode > dim3 gridSize; gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) ); - if( Devices::Cuda::getNumberOfGrids( end - start ) == 1 ) + if( (std::size_t) blockSize.x * gridSize.x >= (std::size_t) end - start ) ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... ); else { // decrease the grid size and align to the number of multiprocessors @@ -257,8 +257,8 @@ struct ParallelFor2D< Devices::Cuda, Mode > gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) ); dim3 gridCount; - gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX ); - gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY ); + gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x ); + gridCount.y = roundUpDivision( sizeY, blockSize.y * gridSize.y ); if( gridCount.x == 1 && gridCount.y == 1 ) ParallelFor2DKernel< false, false ><<< gridSize, blockSize >>> @@ -342,9 +342,9 @@ struct ParallelFor3D< Devices::Cuda, Mode > gridSize.z = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) ); dim3 gridCount; - gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX ); - gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY ); - gridCount.z = Devices::Cuda::getNumberOfGrids( sizeZ ); + gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x ); + gridCount.y = roundUpDivision( sizeY, blockSize.y * gridSize.y ); + gridCount.z = roundUpDivision( sizeZ, blockSize.z * gridSize.z ); if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z == 1 ) ParallelFor3DKernel< false, false, false ><<< gridSize, blockSize >>> diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt index 564a7d1347c65b468e2dea5176032735ed424377..f7d3b68b03d3f5e52a23f6bf47f5c25ac9b8f0ae 100644 --- a/src/UnitTests/CMakeLists.txt +++ b/src/UnitTests/CMakeLists.txt @@ -35,6 +35,15 @@ ADD_EXECUTABLE( ObjectTest ObjectTest.cpp ) TARGET_COMPILE_OPTIONS( ObjectTest PRIVATE ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( ObjectTest ${GTEST_BOTH_LIBRARIES} ) +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( ParallelForTest ParallelForTest.cu OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( ParallelForTest ${GTEST_BOTH_LIBRARIES} ) +else() + ADD_EXECUTABLE( ParallelForTest ParallelForTest.cpp ) + TARGET_COMPILE_OPTIONS( ParallelForTest PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( ParallelForTest ${GTEST_BOTH_LIBRARIES} ) +endif() + ADD_EXECUTABLE( SaveAndLoadMeshfunctionTest SaveAndLoadMeshfunctionTest.cpp ) TARGET_COMPILE_OPTIONS( SaveAndLoadMeshfunctionTest PRIVATE ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( SaveAndLoadMeshfunctionTest ${GTEST_BOTH_LIBRARIES} ) @@ -46,4 +55,5 @@ endif() ADD_TEST( FileTest ${EXECUTABLE_OUTPUT_PATH}/FileTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( StringTest ${EXECUTABLE_OUTPUT_PATH}/StringTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( ObjectTest ${EXECUTABLE_OUTPUT_PATH}/ObjectTest${CMAKE_EXECUTABLE_SUFFIX} ) +ADD_TEST( ParallelForTest ${EXECUTABLE_OUTPUT_PATH}/ParallelForTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( SaveAndLoadMeshfunctionTest ${EXECUTABLE_OUTPUT_PATH}/SaveAndLoadMeshfunctionTest${CMAKE_EXECUTABLE_SUFFIX} ) diff --git a/src/UnitTests/ParallelForTest.cpp b/src/UnitTests/ParallelForTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ed0e90c0b15153b2c746c9138d6afb25a9afc8ad --- /dev/null +++ b/src/UnitTests/ParallelForTest.cpp @@ -0,0 +1 @@ +#include "ParallelForTest.h" diff --git a/src/UnitTests/ParallelForTest.cu b/src/UnitTests/ParallelForTest.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed0e90c0b15153b2c746c9138d6afb25a9afc8ad --- /dev/null +++ b/src/UnitTests/ParallelForTest.cu @@ -0,0 +1 @@ +#include "ParallelForTest.h" diff --git a/src/UnitTests/ParallelForTest.h b/src/UnitTests/ParallelForTest.h new file mode 100644 index 0000000000000000000000000000000000000000..95455286e796f536215166c30c8173d52a14e785 --- /dev/null +++ b/src/UnitTests/ParallelForTest.h @@ -0,0 +1,290 @@ +/*************************************************************************** + ParallelForTest.h - description + ------------------- + begin : Jul 16, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include <TNL/Devices/Host.h> +#include <TNL/Devices/Cuda.h> +#include <TNL/Containers/Array.h> +#include <TNL/ParallelFor.h> + +#ifdef HAVE_GTEST +#include <gtest/gtest.h> +#endif + +using namespace TNL; + +#ifdef HAVE_GTEST +TEST( ParallelForTest, 1D_host ) +{ + using Array = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + Array expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel = [=] (int i) mutable + { + view[i] = i; + }; + ParallelFor< Devices::Host >::exec( 0, size, kernel ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 2D_host ) +{ + using Array = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + Array expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] (int i, int j) mutable + { + view[i] = i; + }; + ParallelFor2D< Devices::Host >::exec( 0, 0, size, 1, kernel1 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] (int i, int j) mutable + { + view[j] = j; + }; + ParallelFor2D< Devices::Host >::exec( 0, 0, 1, size, kernel2 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 3D_host ) +{ + using Array = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + Array expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] (int i, int j, int k) mutable + { + view[i] = i; + }; + ParallelFor3D< Devices::Host >::exec( 0, 0, 0, size, 1, 1, kernel1 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] (int i, int j, int k) mutable + { + view[j] = j; + }; + ParallelFor3D< Devices::Host >::exec( 0, 0, 0, 1, size, 1, kernel2 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel3 = [=] (int i, int j, int k) mutable + { + view[k] = k; + }; + ParallelFor3D< Devices::Host >::exec( 0, 0, 0, 1, 1, size, kernel3 ); + + if( a != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( a[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +#ifdef HAVE_CUDA +// nvcc does not allow __cuda_callable__ lambdas inside private regions +void test_1D_cuda() +{ + using Array = Containers::Array< int, Devices::Cuda >; + using ArrayHost = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + ArrayHost expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel = [=] __cuda_callable__ (int i) mutable + { + view[i] = i; + }; + ParallelFor< Devices::Cuda >::exec( 0, size, kernel ); + + ArrayHost ah; + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 1D_cuda ) +{ + test_1D_cuda(); +} + +// nvcc does not allow __cuda_callable__ lambdas inside private regions +void test_2D_cuda() +{ + using Array = Containers::Array< int, Devices::Cuda >; + using ArrayHost = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + ArrayHost expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] __cuda_callable__ (int i, int j) mutable + { + view[i] = i; + }; + ParallelFor2D< Devices::Cuda >::exec( 0, 0, size, 1, kernel1 ); + + ArrayHost ah; + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] __cuda_callable__ (int i, int j) mutable + { + view[j] = j; + }; + ParallelFor2D< Devices::Cuda >::exec( 0, 0, 1, size, kernel2 ); + + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 2D_cuda ) +{ + test_2D_cuda(); +} + +// nvcc does not allow __cuda_callable__ lambdas inside private regions +void test_3D_cuda() +{ + using Array = Containers::Array< int, Devices::Cuda >; + using ArrayHost = Containers::Array< int, Devices::Host >; + Array a; + for (int size = 100; size <= 100000000; size *= 100) + { + ArrayHost expected; + expected.setSize( size ); + for (int i = 0; i < size; i++) + expected[ i ] = i; + + a.setSize( size ); + a.setValue( 0 ); + auto view = a.getView(); + auto kernel1 = [=] __cuda_callable__ (int i, int j, int k) mutable + { + view[i] = i; + }; + ParallelFor3D< Devices::Cuda >::exec( 0, 0, 0, size, 1, 1, kernel1 ); + + ArrayHost ah; + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel2 = [=] __cuda_callable__ (int i, int j, int k) mutable + { + view[j] = j; + }; + ParallelFor3D< Devices::Cuda >::exec( 0, 0, 0, 1, size, 1, kernel2 ); + + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + + a.setValue( 0 ); + auto kernel3 = [=] __cuda_callable__ (int i, int j, int k) mutable + { + view[k] = k; + }; + ParallelFor3D< Devices::Cuda >::exec( 0, 0, 0, 1, 1, size, kernel3 ); + + ah = a; + if( ah != expected ) { + for (int i = 0; i < size; i++) + ASSERT_EQ( ah[i], i ) << "First index at which the result is wrong is i = " << i; + } + } +} + +TEST( ParallelForTest, 3D_cuda ) +{ + test_3D_cuda(); +} +#endif +#endif + +#include "main.h"