Loading src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +17 −24 Original line number Diff line number Diff line Loading @@ -297,8 +297,7 @@ struct CudaPrefixSumKernelLauncher */ const Index elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); //const auto maxGridSize = 3; //Devices::Cuda::getMaxGridSize(); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); Real gridShift = zero; //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; Loading @@ -310,10 +309,10 @@ struct CudaPrefixSumKernelLauncher /**** * Compute current grid size and size of data to be scanned */ const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize ) currentSize = maxGridSize * elementsInBlock; if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, Loading @@ -331,37 +330,31 @@ struct CudaPrefixSumKernelLauncher * Store the number of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. */ gridsCount = numberOfGrids; gridsCount() = numberOfGrids; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. */ static void setMaxGridSize( int newMaxGridSize ) { maxGridSize = newMaxGridSize; static int& maxGridSize() { static int maxGridSize = Devices::Cuda::getMaxGridSize(); return maxGridSize; } static void resetMaxGridSize() { maxGridSize = Devices::Cuda::getMaxGridSize(); static void resetMaxGridSize() { maxGridSize() = Devices::Cuda::getMaxGridSize(); } static int maxGridSize; static int gridsCount; static int& gridsCount() { static int gridsCount = -1; return gridsCount; } }; template< PrefixSumType prefixSumType, typename Real, typename Index > int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::maxGridSize = Devices::Cuda::getMaxGridSize(); template< PrefixSumType prefixSumType, typename Real, typename Index > int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; #endif } // namespace Algorithms Loading src/TNL/Devices/Host.h +7 −5 Original line number Diff line number Diff line Loading @@ -113,17 +113,19 @@ public: } protected: static bool& ompEnabled() { static bool& ompEnabled() { #ifdef HAVE_OPENMP static bool ompEnabled( true ); static bool ompEnabled = true; #else static bool ompEnabled( false ); static bool ompEnabled = false; #endif return ompEnabled; } static int& maxThreadsCount() { static int maxThreadsCount( -1 ); static int& maxThreadsCount() { static int maxThreadsCount = -1; return maxThreadsCount; } }; Loading src/UnitTests/Containers/VectorPrefixSumTest.h +13 −13 Original line number Diff line number Diff line Loading @@ -74,11 +74,11 @@ TYPED_TEST( VectorTest, prefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::setMaxGridSize( 3 ); Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; v = 0; v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -86,7 +86,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); Loading @@ -94,7 +94,7 @@ TYPED_TEST( VectorTest, prefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i + 1 ); Loading @@ -102,7 +102,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -110,7 +110,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); Loading Loading @@ -184,12 +184,12 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); Loading @@ -197,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -205,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); Loading @@ -213,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); Loading @@ -221,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -229,7 +229,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); Loading Loading
src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +17 −24 Original line number Diff line number Diff line Loading @@ -297,8 +297,7 @@ struct CudaPrefixSumKernelLauncher */ const Index elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); //const auto maxGridSize = 3; //Devices::Cuda::getMaxGridSize(); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); Real gridShift = zero; //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; Loading @@ -310,10 +309,10 @@ struct CudaPrefixSumKernelLauncher /**** * Compute current grid size and size of data to be scanned */ const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize ) currentSize = maxGridSize * elementsInBlock; if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, Loading @@ -331,37 +330,31 @@ struct CudaPrefixSumKernelLauncher * Store the number of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. */ gridsCount = numberOfGrids; gridsCount() = numberOfGrids; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. */ static void setMaxGridSize( int newMaxGridSize ) { maxGridSize = newMaxGridSize; static int& maxGridSize() { static int maxGridSize = Devices::Cuda::getMaxGridSize(); return maxGridSize; } static void resetMaxGridSize() { maxGridSize = Devices::Cuda::getMaxGridSize(); static void resetMaxGridSize() { maxGridSize() = Devices::Cuda::getMaxGridSize(); } static int maxGridSize; static int gridsCount; static int& gridsCount() { static int gridsCount = -1; return gridsCount; } }; template< PrefixSumType prefixSumType, typename Real, typename Index > int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::maxGridSize = Devices::Cuda::getMaxGridSize(); template< PrefixSumType prefixSumType, typename Real, typename Index > int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; #endif } // namespace Algorithms Loading
src/TNL/Devices/Host.h +7 −5 Original line number Diff line number Diff line Loading @@ -113,17 +113,19 @@ public: } protected: static bool& ompEnabled() { static bool& ompEnabled() { #ifdef HAVE_OPENMP static bool ompEnabled( true ); static bool ompEnabled = true; #else static bool ompEnabled( false ); static bool ompEnabled = false; #endif return ompEnabled; } static int& maxThreadsCount() { static int maxThreadsCount( -1 ); static int& maxThreadsCount() { static int maxThreadsCount = -1; return maxThreadsCount; } }; Loading
src/UnitTests/Containers/VectorPrefixSumTest.h +13 −13 Original line number Diff line number Diff line Loading @@ -74,11 +74,11 @@ TYPED_TEST( VectorTest, prefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::setMaxGridSize( 3 ); Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; v = 0; v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -86,7 +86,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); Loading @@ -94,7 +94,7 @@ TYPED_TEST( VectorTest, prefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i + 1 ); Loading @@ -102,7 +102,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -110,7 +110,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); Loading Loading @@ -184,12 +184,12 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); Loading @@ -197,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -205,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); Loading @@ -213,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); Loading @@ -221,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); Loading @@ -229,7 +229,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); Loading