Loading src/TNL/Containers/Algorithms/Multireduction.h +4 −4 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ class Multireduction< Devices::Cuda > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const int n, const Index size, Loading @@ -45,7 +45,7 @@ class Multireduction< Devices::Host > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const int n, const Index size, Loading @@ -60,7 +60,7 @@ class Multireduction< Devices::MIC > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const int n, const Index size, Loading src/TNL/Containers/Algorithms/Multireduction_impl.h +7 −11 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024 * hostResult: output array of size = n */ template< typename Operation, typename Index > bool void Multireduction< Devices::Cuda >:: reduce( Operation& operation, const int n, Loading Loading @@ -80,11 +80,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; _DT2 hostArray2[ Multireduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); } else { return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); } return; } #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -144,7 +145,6 @@ reduce( Operation& operation, #endif TNL_CHECK_CUDA_DEVICE; return true; #else throw Exceptions::CudaSupportMissing(); #endif Loading @@ -161,7 +161,7 @@ reduce( Operation& operation, * hostResult: output array of size = n */ template< typename Operation, typename Index > bool void Multireduction< Devices::Host >:: reduce( Operation& operation, const int n, Loading Loading @@ -247,12 +247,10 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP } #endif return true; } template< typename Operation, typename Index > bool void Multireduction< Devices::MIC >:: reduce( Operation& operation, const int n, Loading @@ -265,11 +263,9 @@ reduce( Operation& operation, TNL_ASSERT( n > 0, ); TNL_ASSERT( size <= ldInput1, ); std::cout << "Not Implemented yet Multireduction< Devices::MIC >::reduce" << std::endl; return true; throw std::runtime_error("Not Implemented yet Multireduction< Devices::MIC >::reduce"); } } // namespace Algorithms } // namespace Containers } // namespace TNL src/TNL/Containers/Algorithms/Reduction.h +3 −3 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, Loading @@ -43,7 +43,7 @@ class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, Loading @@ -56,7 +56,7 @@ class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, Loading src/TNL/Containers/Algorithms/Reduction_impl.h +8 −10 Original line number Diff line number Diff line Loading @@ -39,7 +39,7 @@ namespace Algorithms { static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; template< typename Operation, typename Index > bool void Reduction< Devices::Cuda >:: reduce( Operation& operation, const Index size, Loading Loading @@ -75,11 +75,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result ); Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result ); } else { return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result ); Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result ); } return; } #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -160,14 +161,13 @@ reduce( Operation& operation, } TNL_CHECK_CUDA_DEVICE; return true; #else throw Exceptions::CudaSupportMissing(); #endif }; template< typename Operation, typename Index > bool void Reduction< Devices::Host >:: reduce( Operation& operation, const Index size, Loading Loading @@ -224,8 +224,6 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP } #endif return true; } } // namespace Algorithms Loading src/TNL/Solvers/Linear/CWYGMRES_impl.h +4 −12 Original line number Diff line number Diff line Loading @@ -398,18 +398,14 @@ hauseholder_generate( DeviceVector& Y, // aux = Y_{i-1}^T * y_i RealType aux[ i ]; Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i, size, Y.getData(), ldSize, y_i.getData(), aux ) ) { std::cerr << "multireduction failed" << std::endl; throw 1; } aux ); // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { Loading Loading @@ -497,18 +493,14 @@ hauseholder_cwy_transposed( DeviceVector& z, // aux = Y_i^T * w RealType aux[ i + 1 ]; Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i + 1, size, Y.getData(), ldSize, w.getData(), aux ) ) { std::cerr << "multireduction failed" << std::endl; throw 1; } aux ); // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place Loading Loading
src/TNL/Containers/Algorithms/Multireduction.h +4 −4 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ class Multireduction< Devices::Cuda > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const int n, const Index size, Loading @@ -45,7 +45,7 @@ class Multireduction< Devices::Host > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const int n, const Index size, Loading @@ -60,7 +60,7 @@ class Multireduction< Devices::MIC > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const int n, const Index size, Loading
src/TNL/Containers/Algorithms/Multireduction_impl.h +7 −11 Original line number Diff line number Diff line Loading @@ -49,7 +49,7 @@ static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024 * hostResult: output array of size = n */ template< typename Operation, typename Index > bool void Multireduction< Devices::Cuda >:: reduce( Operation& operation, const int n, Loading Loading @@ -80,11 +80,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; _DT2 hostArray2[ Multireduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); } else { return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); } return; } #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -144,7 +145,6 @@ reduce( Operation& operation, #endif TNL_CHECK_CUDA_DEVICE; return true; #else throw Exceptions::CudaSupportMissing(); #endif Loading @@ -161,7 +161,7 @@ reduce( Operation& operation, * hostResult: output array of size = n */ template< typename Operation, typename Index > bool void Multireduction< Devices::Host >:: reduce( Operation& operation, const int n, Loading Loading @@ -247,12 +247,10 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP } #endif return true; } template< typename Operation, typename Index > bool void Multireduction< Devices::MIC >:: reduce( Operation& operation, const int n, Loading @@ -265,11 +263,9 @@ reduce( Operation& operation, TNL_ASSERT( n > 0, ); TNL_ASSERT( size <= ldInput1, ); std::cout << "Not Implemented yet Multireduction< Devices::MIC >::reduce" << std::endl; return true; throw std::runtime_error("Not Implemented yet Multireduction< Devices::MIC >::reduce"); } } // namespace Algorithms } // namespace Containers } // namespace TNL
src/TNL/Containers/Algorithms/Reduction.h +3 −3 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, Loading @@ -43,7 +43,7 @@ class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, Loading @@ -56,7 +56,7 @@ class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static bool static void reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, Loading
src/TNL/Containers/Algorithms/Reduction_impl.h +8 −10 Original line number Diff line number Diff line Loading @@ -39,7 +39,7 @@ namespace Algorithms { static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; template< typename Operation, typename Index > bool void Reduction< Devices::Cuda >:: reduce( Operation& operation, const Index size, Loading Loading @@ -75,11 +75,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result ); Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result ); } else { return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result ); Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result ); } return; } #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -160,14 +161,13 @@ reduce( Operation& operation, } TNL_CHECK_CUDA_DEVICE; return true; #else throw Exceptions::CudaSupportMissing(); #endif }; template< typename Operation, typename Index > bool void Reduction< Devices::Host >:: reduce( Operation& operation, const Index size, Loading Loading @@ -224,8 +224,6 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP } #endif return true; } } // namespace Algorithms Loading
src/TNL/Solvers/Linear/CWYGMRES_impl.h +4 −12 Original line number Diff line number Diff line Loading @@ -398,18 +398,14 @@ hauseholder_generate( DeviceVector& Y, // aux = Y_{i-1}^T * y_i RealType aux[ i ]; Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i, size, Y.getData(), ldSize, y_i.getData(), aux ) ) { std::cerr << "multireduction failed" << std::endl; throw 1; } aux ); // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { Loading Loading @@ -497,18 +493,14 @@ hauseholder_cwy_transposed( DeviceVector& z, // aux = Y_i^T * w RealType aux[ i + 1 ]; Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i + 1, size, Y.getData(), ldSize, w.getData(), aux ) ) { std::cerr << "multireduction failed" << std::endl; throw 1; } aux ); // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place Loading