Loading src/TNL/Algorithms/detail/CudaScanKernel.h +7 −2 Original line number Diff line number Diff line Loading @@ -436,7 +436,10 @@ CudaScanKernelUniformShift( OutputView output, */ template< ScanType scanType, ScanPhaseType phaseType, int blockSize = 256, typename ValueType, // use blockSize=256 for 32-bit value types, scale with sizeof(ValueType) // to keep shared memory requirements constant int blockSize = 256 * 4 / sizeof(ValueType), // valuesPerThread should be odd to avoid shared memory bank conflicts int valuesPerThread = 7 > struct CudaScanKernelLauncher Loading Loading @@ -520,6 +523,7 @@ struct CudaScanKernelLauncher Reduction&& reduction, typename OutputArray::ValueType zero ) { static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" ); using Index = typename InputArray::IndexType; if( end - begin <= blockSize * valuesPerThread ) { Loading Loading @@ -639,7 +643,7 @@ struct CudaScanKernelLauncher // blockResults now contains scan results for each block. The first phase // ends by computing an exclusive scan of this array. CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform( CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::perform( blockResults, blockResults, 0, Loading Loading @@ -689,6 +693,7 @@ struct CudaScanKernelLauncher typename OutputArray::ValueType zero, typename OutputArray::ValueType shift ) { static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" ); using Index = typename InputArray::IndexType; // if the input was already scanned with just one block in the first phase, Loading src/TNL/Algorithms/detail/Scan.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -278,7 +278,7 @@ perform( const InputArray& input, if( end <= begin ) return; detail::CudaScanKernelLauncher< Type, PhaseType >::perform( detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::perform( input, output, begin, Loading Loading @@ -312,7 +312,7 @@ performFirstPhase( const InputArray& input, return block_results; } return detail::CudaScanKernelLauncher< Type, PhaseType >::performFirstPhase( return detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performFirstPhase( input, output, begin, Loading Loading @@ -346,7 +346,7 @@ performSecondPhase( const InputArray& input, if( end <= begin ) return; detail::CudaScanKernelLauncher< Type, PhaseType >::performSecondPhase( detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performSecondPhase( input, output, blockShifts, Loading src/UnitTests/Algorithms/distributedScanTest.h +10 −10 Original line number Diff line number Diff line Loading @@ -87,14 +87,14 @@ protected: #ifdef HAVE_CUDA if( std::is_same< DeviceType, Devices::Cuda >::value ) { CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; } #endif } Loading @@ -106,8 +106,8 @@ protected: // skip the check for too small arrays if( check_cuda_grids && array.getLocalRange().getSize() > 256 ) { // we don't care which kernel launcher was actually used const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() ); const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() ); EXPECT_GT( gridsCount, 1 ); } #endif Loading src/UnitTests/Algorithms/scanTest.h +10 −10 Original line number Diff line number Diff line Loading @@ -59,14 +59,14 @@ protected: #ifdef HAVE_CUDA if( std::is_same< DeviceType, Devices::Cuda >::value ) { CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; } #endif } Loading @@ -78,8 +78,8 @@ protected: // skip the check for too small arrays if( array.getSize() > 256 ) { // we don't care which kernel launcher was actually used const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() ); const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() ); EXPECT_GT( gridsCount, 1 ); } #endif Loading Loading
src/TNL/Algorithms/detail/CudaScanKernel.h +7 −2 Original line number Diff line number Diff line Loading @@ -436,7 +436,10 @@ CudaScanKernelUniformShift( OutputView output, */ template< ScanType scanType, ScanPhaseType phaseType, int blockSize = 256, typename ValueType, // use blockSize=256 for 32-bit value types, scale with sizeof(ValueType) // to keep shared memory requirements constant int blockSize = 256 * 4 / sizeof(ValueType), // valuesPerThread should be odd to avoid shared memory bank conflicts int valuesPerThread = 7 > struct CudaScanKernelLauncher Loading Loading @@ -520,6 +523,7 @@ struct CudaScanKernelLauncher Reduction&& reduction, typename OutputArray::ValueType zero ) { static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" ); using Index = typename InputArray::IndexType; if( end - begin <= blockSize * valuesPerThread ) { Loading Loading @@ -639,7 +643,7 @@ struct CudaScanKernelLauncher // blockResults now contains scan results for each block. The first phase // ends by computing an exclusive scan of this array. CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform( CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::perform( blockResults, blockResults, 0, Loading Loading @@ -689,6 +693,7 @@ struct CudaScanKernelLauncher typename OutputArray::ValueType zero, typename OutputArray::ValueType shift ) { static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" ); using Index = typename InputArray::IndexType; // if the input was already scanned with just one block in the first phase, Loading
src/TNL/Algorithms/detail/Scan.hpp +3 −3 Original line number Diff line number Diff line Loading @@ -278,7 +278,7 @@ perform( const InputArray& input, if( end <= begin ) return; detail::CudaScanKernelLauncher< Type, PhaseType >::perform( detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::perform( input, output, begin, Loading Loading @@ -312,7 +312,7 @@ performFirstPhase( const InputArray& input, return block_results; } return detail::CudaScanKernelLauncher< Type, PhaseType >::performFirstPhase( return detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performFirstPhase( input, output, begin, Loading Loading @@ -346,7 +346,7 @@ performSecondPhase( const InputArray& input, if( end <= begin ) return; detail::CudaScanKernelLauncher< Type, PhaseType >::performSecondPhase( detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performSecondPhase( input, output, blockShifts, Loading
src/UnitTests/Algorithms/distributedScanTest.h +10 −10 Original line number Diff line number Diff line Loading @@ -87,14 +87,14 @@ protected: #ifdef HAVE_CUDA if( std::is_same< DeviceType, Devices::Cuda >::value ) { CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; } #endif } Loading @@ -106,8 +106,8 @@ protected: // skip the check for too small arrays if( check_cuda_grids && array.getLocalRange().getSize() > 256 ) { // we don't care which kernel launcher was actually used const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() ); const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() ); EXPECT_GT( gridsCount, 1 ); } #endif Loading
src/UnitTests/Algorithms/scanTest.h +10 −10 Original line number Diff line number Diff line Loading @@ -59,14 +59,14 @@ protected: #ifdef HAVE_CUDA if( std::is_same< DeviceType, Devices::Cuda >::value ) { CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize(); CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3; } #endif } Loading @@ -78,8 +78,8 @@ protected: // skip the check for too small arrays if( array.getSize() > 256 ) { // we don't care which kernel launcher was actually used const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() ); const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(), CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() ); EXPECT_GT( gridsCount, 1 ); } #endif Loading