Loading src/TNL/Containers/Algorithms/Multireduction_impl.h +54 −11 Original line number Diff line number Diff line Loading @@ -225,7 +225,7 @@ reduce( Operation& operation, } } // reduction of local results // local reduction of unrolled results for( int k = 0; k < n; k++ ) { ResultType* _r = r + 4 * k; operation.commonReduction( _r[ 0 ], _r[ 1 ] ); Loading @@ -242,6 +242,48 @@ reduce( Operation& operation, } else { #endif if( blocks > 1 ) { // initialize array for unrolled results // (it is accessed as a row-major matrix with n rows and 4 columns) ResultType r[ n * 4 ]; for( int k = 0; k < n * 4; k++ ) r[ k ] = operation.initialValue(); // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; ResultType* _r = r + 4 * k; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); } } } // reduction of the last, incomplete block (not unrolled) for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; ResultType* _r = r + 4 * k; for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( _r[ 0 ], i, _input1, input2 ); } // reduction of unrolled results for( int k = 0; k < n; k++ ) { ResultType* _r = r + 4 * k; operation.commonReduction( _r[ 0 ], _r[ 1 ] ); operation.commonReduction( _r[ 0 ], _r[ 2 ] ); operation.commonReduction( _r[ 0 ], _r[ 3 ] ); // copy the result into the output parameter result[ k ] = _r[ 0 ]; } } else { for( int k = 0; k < n; k++ ) result[ k ] = operation.initialValue(); Loading @@ -259,6 +301,7 @@ reduce( Operation& operation, for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( result[ k ], i, _input1, input2 ); } } #ifdef HAVE_OPENMP } #endif Loading src/TNL/Containers/Algorithms/Reduction_impl.h +36 −8 Original line number Diff line number Diff line Loading @@ -179,15 +179,15 @@ reduce( Operation& operation, typedef typename Operation::DataType2 DataType2; typedef typename Operation::ResultType ResultType; #ifdef HAVE_OPENMP constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable ResultType result = operation.initialValue(); #pragma omp parallel { const int blocks = size / block_size; // initialize array for thread-local results ResultType r[ 4 ] = { operation.initialValue() }; Loading @@ -209,7 +209,7 @@ reduce( Operation& operation, operation.firstReduction( r[ 0 ], i, input1, input2 ); } // reduction of local results // local reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 1 ] ); operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 0 ], r[ 3 ] ); Loading @@ -224,10 +224,38 @@ reduce( Operation& operation, } else { #endif if( blocks > 1 ) { // initialize array for unrolled results ResultType r[ 4 ] = { operation.initialValue() }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r[ 0 ], i, input1, input2 ); // reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 1 ] ); operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 0 ], r[ 3 ] ); return r[ 0 ]; } else { ResultType result = operation.initialValue(); for( IndexType i = 0; i < size; i++ ) operation.firstReduction( result, i, input1, input2 ); return result; } #ifdef HAVE_OPENMP } #endif Loading Loading
src/TNL/Containers/Algorithms/Multireduction_impl.h +54 −11 Original line number Diff line number Diff line Loading @@ -225,7 +225,7 @@ reduce( Operation& operation, } } // reduction of local results // local reduction of unrolled results for( int k = 0; k < n; k++ ) { ResultType* _r = r + 4 * k; operation.commonReduction( _r[ 0 ], _r[ 1 ] ); Loading @@ -242,6 +242,48 @@ reduce( Operation& operation, } else { #endif if( blocks > 1 ) { // initialize array for unrolled results // (it is accessed as a row-major matrix with n rows and 4 columns) ResultType r[ n * 4 ]; for( int k = 0; k < n * 4; k++ ) r[ k ] = operation.initialValue(); // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; ResultType* _r = r + 4 * k; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); } } } // reduction of the last, incomplete block (not unrolled) for( int k = 0; k < n; k++ ) { const DataType1* _input1 = input1 + k * ldInput1; ResultType* _r = r + 4 * k; for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( _r[ 0 ], i, _input1, input2 ); } // reduction of unrolled results for( int k = 0; k < n; k++ ) { ResultType* _r = r + 4 * k; operation.commonReduction( _r[ 0 ], _r[ 1 ] ); operation.commonReduction( _r[ 0 ], _r[ 2 ] ); operation.commonReduction( _r[ 0 ], _r[ 3 ] ); // copy the result into the output parameter result[ k ] = _r[ 0 ]; } } else { for( int k = 0; k < n; k++ ) result[ k ] = operation.initialValue(); Loading @@ -259,6 +301,7 @@ reduce( Operation& operation, for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( result[ k ], i, _input1, input2 ); } } #ifdef HAVE_OPENMP } #endif Loading
src/TNL/Containers/Algorithms/Reduction_impl.h +36 −8 Original line number Diff line number Diff line Loading @@ -179,15 +179,15 @@ reduce( Operation& operation, typedef typename Operation::DataType2 DataType2; typedef typename Operation::ResultType ResultType; #ifdef HAVE_OPENMP constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable ResultType result = operation.initialValue(); #pragma omp parallel { const int blocks = size / block_size; // initialize array for thread-local results ResultType r[ 4 ] = { operation.initialValue() }; Loading @@ -209,7 +209,7 @@ reduce( Operation& operation, operation.firstReduction( r[ 0 ], i, input1, input2 ); } // reduction of local results // local reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 1 ] ); operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 0 ], r[ 3 ] ); Loading @@ -224,10 +224,38 @@ reduce( Operation& operation, } else { #endif if( blocks > 1 ) { // initialize array for unrolled results ResultType r[ 4 ] = { operation.initialValue() }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r[ 0 ], i, input1, input2 ); // reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 1 ] ); operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 0 ], r[ 3 ] ); return r[ 0 ]; } else { ResultType result = operation.initialValue(); for( IndexType i = 0; i < size; i++ ) operation.firstReduction( result, i, input1, input2 ); return result; } #ifdef HAVE_OPENMP } #endif Loading