Loading src/TNL/Containers/Algorithms/Reduction_impl.h +16 −6 Original line number Diff line number Diff line Loading @@ -188,27 +188,37 @@ reduce( Operation& operation, { const int blocks = size / block_size; // initialize thread-local result variable ResultType r = operation.initialValue(); // initialize array for thread-local results ResultType r[ 4 ] = { operation.initialValue() }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const int offset = b * block_size; for( IndexType i = 0; i < block_size; i++ ) operation.firstReduction( r, offset + i, input1, input2 ); for( IndexType i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); } } // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r, i, input1, input2 ); operation.firstReduction( r[ 0 ], i, input1, input2 ); } // reduction of local results operation.commonReduction( r[ 0 ], r[ 1 ] ); operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 0 ], r[ 3 ] ); // inter-thread reduction of local results #pragma omp critical { operation.commonReduction( result, r ); operation.commonReduction( result, r[ 0 ] ); } } return result; } Loading Loading
src/TNL/Containers/Algorithms/Reduction_impl.h +16 −6 Original line number Diff line number Diff line Loading @@ -188,27 +188,37 @@ reduce( Operation& operation, { const int blocks = size / block_size; // initialize thread-local result variable ResultType r = operation.initialValue(); // initialize array for thread-local results ResultType r[ 4 ] = { operation.initialValue() }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const int offset = b * block_size; for( IndexType i = 0; i < block_size; i++ ) operation.firstReduction( r, offset + i, input1, input2 ); for( IndexType i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); } } // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r, i, input1, input2 ); operation.firstReduction( r[ 0 ], i, input1, input2 ); } // reduction of local results operation.commonReduction( r[ 0 ], r[ 1 ] ); operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 0 ], r[ 3 ] ); // inter-thread reduction of local results #pragma omp critical { operation.commonReduction( result, r ); operation.commonReduction( result, r[ 0 ] ); } } return result; } Loading