Loading src/TNL/Containers/Algorithms/Multireduction.h +36 −14 Original line number Diff line number Diff line #pragma once #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > class Multireduction { }; template<> class Multireduction< Devices::Cuda > { public: template< typename Operation > bool multireductionOnCudaDevice( const Operation& operation, static bool reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); }; template<> class Multireduction< Devices::Host > { public: template< typename Operation > bool multireductionOnHostDevice( const Operation& operation, static bool reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); }; } // namespace Algorithms } // namespace Containers Loading src/TNL/Containers/Algorithms/Multireduction_impl.h +95 −22 Original line number Diff line number Diff line #pragma once #include "Multireduction.h" //#define CUDA_REDUCTION_PROFILING #include <TNL/Assert.h> Loading Loading @@ -34,7 +36,9 @@ static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024 * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnCudaDevice( Operation& operation, bool Multireduction< Devices::Cuda >:: reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, Loading @@ -44,6 +48,7 @@ bool multireductionOnCudaDevice( Operation& operation, { #ifdef HAVE_CUDA Assert( n > 0, ); Assert( size <= ldInput1, ); typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; Loading @@ -62,10 +67,10 @@ bool multireductionOnCudaDevice( Operation& operation, RealType hostArray2[ Multireduction_minGpuDataSize ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) ) return false; return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); } else { return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, nullptr, hostResult ); return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, nullptr, hostResult ); } } Loading Loading @@ -119,7 +124,7 @@ bool multireductionOnCudaDevice( Operation& operation, * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; multireductionOnHostDevice( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult ); Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -144,7 +149,9 @@ bool multireductionOnCudaDevice( Operation& operation, * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnHostDevice( Operation& operation, bool Multireduction< Devices::Host >:: reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* input1, Loading @@ -152,15 +159,81 @@ bool multireductionOnHostDevice( Operation& operation, const typename Operation::RealType* input2, typename Operation::ResultType* result ) { Assert( n > 0, ); Assert( size <= ldInput1, ); typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; const int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) #pragma omp parallel { // first thread initializes the result array #pragma omp single nowait { for( int k = 0; k < n; k++ ) result[ k ] = operation.initialValue(); } // initialize array for thread-local results ResultType r[ n ]; for( int k = 0; k < n; k++ ) r[ k ] = operation.initialValue(); #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const int offset = b * block_size; for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) r[ k ] = operation.reduceOnHost( offset + i, r[ k ], _input1, input2 ); } } // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = blocks * block_size; i < size; i++ ) r[ k ] = operation.reduceOnHost( i, r[ k ], _input1, input2 ); } } // inter-thread reduction of local results for( int k = 0; k < n; k++ ) #pragma omp critical { operation.commonReductionOnDevice( result[ k ], r[ k ] ); } } else { #endif for( int k = 0; k < n; k++ ) result[ k ] = operation.initialValue(); for( int b = 0; b < blocks; b++ ) { const int offset = b * block_size; for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) result[ k ] = operation.reduceOnHost( offset + i, result[ k ], _input1, input2 ); } } for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < size; i++ ) for( IndexType i = blocks * block_size; i < size; i++ ) result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 ); } #ifdef HAVE_OPENMP } #endif return true; } Loading src/TNL/Matrices/MatrixOperations.h +6 −0 Original line number Diff line number Diff line Loading @@ -40,6 +40,9 @@ public: Assert( m <= lda, ); if( beta != 0.0 ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) #endif for( IndexType j = 0; j < m; j++ ) { RealType tmp = 0.0; for( int k = 0; k < n; k++ ) Loading @@ -49,6 +52,9 @@ public: } else { // the vector y might be uninitialized, and 0.0 * NaN = NaN #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) #endif for( IndexType j = 0; j < m; j++ ) { RealType tmp = 0.0; for( int k = 0; k < n; k++ ) Loading src/TNL/Solvers/Linear/CWYGMRES_impl.h +24 −42 Original line number Diff line number Diff line Loading @@ -388,17 +388,9 @@ hauseholder_generate( DeviceVector& Y, if( i > 0 ) { // aux = Y_{i-1}^T * y_i RealType aux[ i ]; if( std::is_same< DeviceType, Devices::Host >::value ) { // this is optimized better than multireductionOnHostDevice DeviceVector y_k; for( int k = 0; k < i; k++ ) { y_k.bind( &Y.getData()[ k * ldSize ], size ); aux[ k ] = y_k.scalarProduct( y_i ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct; if( ! multireductionOnCudaDevice( scalarProduct, if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i, size, Y.getData(), Loading @@ -406,10 +398,9 @@ hauseholder_generate( DeviceVector& Y, y_i.getData(), aux ) ) { std::cerr << "multireductionOnCudaDevice failed" << std::endl; std::cerr << "multireduction failed" << std::endl; throw 1; } } // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { Loading Loading @@ -503,17 +494,9 @@ hauseholder_cwy_transposed( DeviceVector& z, { // aux = Y_i^T * w RealType aux[ i + 1 ]; if( std::is_same< DeviceType, Devices::Host >::value ) { // this is optimized better than multireductionOnHostDevice DeviceVector y_k; for( int k = 0; k <= i; k++ ) { y_k.bind( &Y.getData()[ k * ldSize ], size ); aux[ k ] = y_k.scalarProduct( w ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct; if( ! multireductionOnCudaDevice( scalarProduct, if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i + 1, size, Y.getData(), Loading @@ -521,10 +504,9 @@ hauseholder_cwy_transposed( DeviceVector& z, w.getData(), aux ) ) { std::cerr << "multireductionOnCudaDevice failed" << std::endl; std::cerr << "multireduction failed" << std::endl; throw 1; } } // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place Loading Loading
src/TNL/Containers/Algorithms/Multireduction.h +36 −14 Original line number Diff line number Diff line #pragma once #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > class Multireduction { }; template<> class Multireduction< Devices::Cuda > { public: template< typename Operation > bool multireductionOnCudaDevice( const Operation& operation, static bool reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); }; template<> class Multireduction< Devices::Host > { public: template< typename Operation > bool multireductionOnHostDevice( const Operation& operation, static bool reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); }; } // namespace Algorithms } // namespace Containers Loading
src/TNL/Containers/Algorithms/Multireduction_impl.h +95 −22 Original line number Diff line number Diff line #pragma once #include "Multireduction.h" //#define CUDA_REDUCTION_PROFILING #include <TNL/Assert.h> Loading Loading @@ -34,7 +36,9 @@ static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024 * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnCudaDevice( Operation& operation, bool Multireduction< Devices::Cuda >:: reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, Loading @@ -44,6 +48,7 @@ bool multireductionOnCudaDevice( Operation& operation, { #ifdef HAVE_CUDA Assert( n > 0, ); Assert( size <= ldInput1, ); typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; Loading @@ -62,10 +67,10 @@ bool multireductionOnCudaDevice( Operation& operation, RealType hostArray2[ Multireduction_minGpuDataSize ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) ) return false; return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); } else { return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, nullptr, hostResult ); return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, nullptr, hostResult ); } } Loading Loading @@ -119,7 +124,7 @@ bool multireductionOnCudaDevice( Operation& operation, * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; multireductionOnHostDevice( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult ); Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -144,7 +149,9 @@ bool multireductionOnCudaDevice( Operation& operation, * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnHostDevice( Operation& operation, bool Multireduction< Devices::Host >:: reduce( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* input1, Loading @@ -152,15 +159,81 @@ bool multireductionOnHostDevice( Operation& operation, const typename Operation::RealType* input2, typename Operation::ResultType* result ) { Assert( n > 0, ); Assert( size <= ldInput1, ); typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; const int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) #pragma omp parallel { // first thread initializes the result array #pragma omp single nowait { for( int k = 0; k < n; k++ ) result[ k ] = operation.initialValue(); } // initialize array for thread-local results ResultType r[ n ]; for( int k = 0; k < n; k++ ) r[ k ] = operation.initialValue(); #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const int offset = b * block_size; for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) r[ k ] = operation.reduceOnHost( offset + i, r[ k ], _input1, input2 ); } } // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = blocks * block_size; i < size; i++ ) r[ k ] = operation.reduceOnHost( i, r[ k ], _input1, input2 ); } } // inter-thread reduction of local results for( int k = 0; k < n; k++ ) #pragma omp critical { operation.commonReductionOnDevice( result[ k ], r[ k ] ); } } else { #endif for( int k = 0; k < n; k++ ) result[ k ] = operation.initialValue(); for( int b = 0; b < blocks; b++ ) { const int offset = b * block_size; for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < block_size; i++ ) result[ k ] = operation.reduceOnHost( offset + i, result[ k ], _input1, input2 ); } } for( int k = 0; k < n; k++ ) { const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < size; i++ ) for( IndexType i = blocks * block_size; i < size; i++ ) result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 ); } #ifdef HAVE_OPENMP } #endif return true; } Loading
src/TNL/Matrices/MatrixOperations.h +6 −0 Original line number Diff line number Diff line Loading @@ -40,6 +40,9 @@ public: Assert( m <= lda, ); if( beta != 0.0 ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) #endif for( IndexType j = 0; j < m; j++ ) { RealType tmp = 0.0; for( int k = 0; k < n; k++ ) Loading @@ -49,6 +52,9 @@ public: } else { // the vector y might be uninitialized, and 0.0 * NaN = NaN #ifdef HAVE_OPENMP #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() ) #endif for( IndexType j = 0; j < m; j++ ) { RealType tmp = 0.0; for( int k = 0; k < n; k++ ) Loading
src/TNL/Solvers/Linear/CWYGMRES_impl.h +24 −42 Original line number Diff line number Diff line Loading @@ -388,17 +388,9 @@ hauseholder_generate( DeviceVector& Y, if( i > 0 ) { // aux = Y_{i-1}^T * y_i RealType aux[ i ]; if( std::is_same< DeviceType, Devices::Host >::value ) { // this is optimized better than multireductionOnHostDevice DeviceVector y_k; for( int k = 0; k < i; k++ ) { y_k.bind( &Y.getData()[ k * ldSize ], size ); aux[ k ] = y_k.scalarProduct( y_i ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct; if( ! multireductionOnCudaDevice( scalarProduct, if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i, size, Y.getData(), Loading @@ -406,10 +398,9 @@ hauseholder_generate( DeviceVector& Y, y_i.getData(), aux ) ) { std::cerr << "multireductionOnCudaDevice failed" << std::endl; std::cerr << "multireduction failed" << std::endl; throw 1; } } // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux for( int k = 0; k < i; k++ ) { Loading Loading @@ -503,17 +494,9 @@ hauseholder_cwy_transposed( DeviceVector& z, { // aux = Y_i^T * w RealType aux[ i + 1 ]; if( std::is_same< DeviceType, Devices::Host >::value ) { // this is optimized better than multireductionOnHostDevice DeviceVector y_k; for( int k = 0; k <= i; k++ ) { y_k.bind( &Y.getData()[ k * ldSize ], size ); aux[ k ] = y_k.scalarProduct( w ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct; if( ! multireductionOnCudaDevice( scalarProduct, if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce ( scalarProduct, i + 1, size, Y.getData(), Loading @@ -521,10 +504,9 @@ hauseholder_cwy_transposed( DeviceVector& z, w.getData(), aux ) ) { std::cerr << "multireductionOnCudaDevice failed" << std::endl; std::cerr << "multireduction failed" << std::endl; throw 1; } } // aux = T_i^T * aux // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place Loading