Bug 1385982. Don't start kicking off work units during parallel stylo traversal until they're actually full. r?bholley
MozReview-Commit-ID: 8B4yjjGdEFQ
--- a/servo/components/style/parallel.rs
+++ b/servo/components/style/parallel.rs
@@ -132,17 +132,18 @@ fn create_thread_local_context<'scope, E
/// A parallel top-down DOM traversal.
///
/// This algorithm traverses the DOM in a breadth-first, top-down manner. The
/// goals are:
/// * Never process a child before its parent (since child style depends on
/// parent style). If this were to happen, the styling algorithm would panic.
/// * Prioritize discovering nodes as quickly as possible to maximize
-/// opportunities for parallelism.
+/// opportunities for parallelism. But this needs to be weighed against
+/// styling cousins on a single thread to improve sharing.
/// * Style all the children of a given node (i.e. all sibling nodes) on
/// a single thread (with an upper bound to handle nodes with an
/// abnormally large number of children). This is important because we use
/// a thread-local cache to share styles between siblings.
#[inline(always)]
#[allow(unsafe_code)]
fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
recursion_depth: usize,
@@ -168,29 +169,49 @@ fn top_down_dom<'a, 'scope, E, D>(nodes:
let mut tlc = tls.ensure(
|slot: &mut Option<ThreadLocalStyleContext<E>>| create_thread_local_context(traversal, slot));
let mut context = StyleContext {
shared: traversal.shared_context(),
thread_local: &mut *tlc,
};
for n in nodes {
- // If the last node we processed produced children, spawn them off
- // into a work item. We do this at the beginning of the loop (rather
- // than at the end) so that we can traverse the children of the last
- // sibling directly on this thread without a spawn call.
+ // If the last node we processed produced children, we may want to
+ // spawn them off into a work item. We do this at the beginning of
+ // the loop (rather than at the end) so that we can traverse the
+ // children of the last sibling directly on this thread without a
+ // spawn call.
//
// This has the important effect of removing the allocation and
// context-switching overhead of the parallel traversal for perfectly
// linear regions of the DOM, i.e.:
//
// <russian><doll><tag><nesting></nesting></tag></doll></russian>
//
- // Which are not at all uncommon.
- if !discovered_child_nodes.is_empty() {
+ // which are not at all uncommon.
+ //
+ // There's a tension here between spawning off a work item as soon
+ // as discovered_child_nodes is nonempty and waiting until we have a
+ // full work item to do so. The former optimizes for speed of
+ // discovery (we'll start discovering the kids of the things in
+ // "nodes" ASAP). The latter gives us better sharing (e.g. we can
+ // share between cousins much better, because we don't hand them off
+ // as separate work items, which are likely to end up on separate
+ // threads) and gives us a chance to just handle everything on this
+ // thread for small DOM subtrees, as in the linear example above.
+ //
+ // The worst case behavior for waiting until we have a full work
+ // item is a deep tree which has (WORK_UNIT_MAX - 1) "linear"
+ // branches, hence WORK_UNIT_MAX-1 elements at each level. Such a
+ // tree would end up getting processed entirely sequentially,
+ // because we would process each level via our end-of-loop tail
+ // call. If we kicked off a traversal as soon as we discovered
+ // kids, we would instead process such a tree more or less with a
+ // thread-per-branch, multiplexed across our actual threadpool.
+ if discovered_child_nodes.len() >= WORK_UNIT_MAX {
let mut traversal_data_copy = traversal_data.clone();
traversal_data_copy.current_dom_depth += 1;
traverse_nodes(&*discovered_child_nodes,
DispatchMode::NotTailCall,
recursion_depth,
root,
traversal_data_copy,
scope,