Bug 1420163 - Update webrender to commit e30886d78c91bdd433fd978a39c511ef9416608e. r?jrmuizel draft
author Kartikaya Gupta <kgupta@mozilla.com>
Tue, 28 Nov 2017 09:40:01 -0500
changeset 704274 e2a18e34a187c47f25169ae1c2113f62abdbb679
parent 704194 5b33b070378ae0806bed0b5e5e34de429a29e7db
child 704275 32881fea548f43300f9a4ac149532e47ad214a6a
push id 91138
push user kgupta@mozilla.com
push date Tue, 28 Nov 2017 14:41:35 +0000
reviewers jrmuizel
bugs 1420163
milestone 59.0a1
Bug 1420163 - Update webrender to commit e30886d78c91bdd433fd978a39c511ef9416608e. r?jrmuizel MozReview-Commit-ID: 1HI5MrCCdn4
gfx/doc/README.webrender
gfx/webrender/res/brush.glsl
gfx/webrender/res/brush_image.glsl
gfx/webrender/res/ellipse.glsl
gfx/webrender/res/prim_shared.glsl
gfx/webrender/res/ps_border_corner.glsl
gfx/webrender/res/ps_border_edge.glsl
gfx/webrender/res/ps_composite.glsl
gfx/webrender/res/ps_gradient.glsl
gfx/webrender/res/ps_rectangle.glsl
gfx/webrender/src/clip.rs
gfx/webrender/src/clip_scroll_node.rs
gfx/webrender/src/clip_scroll_tree.rs
gfx/webrender/src/device.rs
gfx/webrender/src/frame_builder.rs
gfx/webrender/src/glyph_rasterizer.rs
gfx/webrender/src/gpu_types.rs
gfx/webrender/src/lib.rs
gfx/webrender/src/picture.rs
gfx/webrender/src/platform/macos/font.rs
gfx/webrender/src/platform/unix/font.rs
gfx/webrender/src/platform/windows/font.rs
gfx/webrender/src/prim_store.rs
gfx/webrender/src/profiler.rs
gfx/webrender/src/render_backend.rs
gfx/webrender/src/render_task.rs
gfx/webrender/src/renderer.rs
gfx/webrender/src/texture_cache.rs
gfx/webrender/src/tiling.rs
gfx/webrender/src/util.rs
--- a/gfx/doc/README.webrender
+++ b/gfx/doc/README.webrender
@@ -170,9 +170,9 @@ 2. Sometimes autoland tip has changed en
    has an env var you can set to do this). In theory you can get the same
    result by resolving the conflict manually but Cargo.lock files are usually not
    trivial to merge by hand. If it's just the third_party/rust dir that has conflicts
    you can delete it and run |mach vendor rust| again to repopulate it.
 
 -------------------------------------------------------------------------------
 
 The version of WebRender currently in the tree is:
-b7b07562fda338fcb2faff66ce01aafb6235fbcf
+e30886d78c91bdd433fd978a39c511ef9416608e
--- a/gfx/webrender/res/brush.glsl
+++ b/gfx/webrender/res/brush.glsl
@@ -62,32 +62,51 @@ void main(void) {
         // Right now - pictures only support local positions. In the future, this
         // will be expanded to support transform picture types (the common kind).
         device_pos = pic_task.common_data.task_rect.p0 +
                      uDevicePixelRatio * (local_pos - pic_task.content_origin);
 
         // Write the final position transformed by the orthographic device-pixel projection.
         gl_Position = uTransform * vec4(device_pos, 0.0, 1.0);
     } else {
+        VertexInfo vi;
         Layer layer = fetch_layer(brush.clip_node_id, brush.scroll_node_id);
         ClipArea clip_area = fetch_clip_area(brush.clip_address);
 
         // Write the normal vertex information out.
-        // TODO(gw): Support transform types in brushes. For now,
-        //           the old cache image shader didn't support
-        //           them yet anyway, so we're not losing any
-        //           existing functionality.
-        VertexInfo vi = write_vertex(
-            geom.local_rect,
-            geom.local_clip_rect,
-            float(brush.z),
-            layer,
-            pic_task,
-            geom.local_rect
-        );
+        if (layer.is_axis_aligned) {
+            vi = write_vertex(
+                geom.local_rect,
+                geom.local_clip_rect,
+                float(brush.z),
+                layer,
+                pic_task,
+                geom.local_rect
+            );
+
+            // TODO(gw): vLocalBounds may be referenced by
+            //           the fragment shader when running in
+            //           the alpha pass, even on non-transformed
+            //           items. For now, just ensure it has no
+            //           effect. We can tidy this up as we move
+            //           more items to be brush shaders.
+            vLocalBounds = vec4(
+                geom.local_clip_rect.p0,
+                geom.local_clip_rect.p0 + geom.local_clip_rect.size
+            );
+        } else {
+            vi = write_transform_vertex(geom.local_rect,
+                geom.local_rect,
+                geom.local_clip_rect,
+                vec4(1.0),
+                float(brush.z),
+                layer,
+                pic_task
+            );
+        }
 
         local_pos = vi.local_pos;
 
         // For brush instances in the alpha pass, always write
         // out clip information.
         // TODO(gw): It's possible that we might want alpha
         //           shaders that don't clip in the future,
         //           but it's reasonable to assume that one
--- a/gfx/webrender/res/brush_image.glsl
+++ b/gfx/webrender/res/brush_image.glsl
@@ -1,14 +1,18 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include shared,prim_shared,brush
 
+#ifdef WR_FEATURE_ALPHA_PASS
+varying vec2 vLocalPos;
+#endif
+
 varying vec3 vUv;
 flat varying int vImageKind;
 flat varying vec4 vUvBounds;
 flat varying vec4 vUvBounds_NoClamp;
 flat varying vec4 vParams;
 
 #if defined WR_FEATURE_ALPHA_TARGET
 flat varying vec4 vColor;
@@ -71,16 +75,20 @@ void brush_vs(
             vUv.xy = (local_pos - local_rect.p0) / local_src_size;
             vParams.xy = 0.5 * local_rect.size / local_src_size;
             break;
         }
     }
 
     vUvBounds = vec4(uv0 + vec2(0.5), uv1 - vec2(0.5)) / texture_size.xyxy;
     vUvBounds_NoClamp = vec4(uv0, uv1) / texture_size.xyxy;
+
+#ifdef WR_FEATURE_ALPHA_PASS
+    vLocalPos = local_pos;
+#endif
 }
 #endif
 
 #ifdef WR_FRAGMENT_SHADER
 vec4 brush_fs() {
     vec2 uv;
 
     switch (vImageKind) {
@@ -111,11 +119,15 @@ vec4 brush_fs() {
     }
 
 #if defined WR_FEATURE_COLOR_TARGET
     vec4 color = texture(sColor0, vec3(uv, vUv.z));
 #else
     vec4 color = vColor * texture(sColor1, vec3(uv, vUv.z)).r;
 #endif
 
+#ifdef WR_FEATURE_ALPHA_PASS
+    color *= init_transform_fs(vLocalPos);
+#endif
+
     return color;
 }
 #endif
--- a/gfx/webrender/res/ellipse.glsl
+++ b/gfx/webrender/res/ellipse.glsl
@@ -67,22 +67,22 @@ float distance_to_ellipse(vec2 p, vec2 r
 }
 
 float clip_against_ellipse_if_needed(
     vec2 pos,
     float current_distance,
     vec4 ellipse_center_radius,
     vec2 sign_modifier
 ) {
-    float ellipse_distance = distance_to_ellipse(pos - ellipse_center_radius.xy,
-                                                 ellipse_center_radius.zw);
+    if (!all(lessThan(sign_modifier * pos, sign_modifier * ellipse_center_radius.xy))) {
+      return current_distance;
+    }
 
-    return mix(current_distance,
-               ellipse_distance,
-               all(lessThan(sign_modifier * pos, sign_modifier * ellipse_center_radius.xy)));
+    return distance_to_ellipse(pos - ellipse_center_radius.xy,
+                               ellipse_center_radius.zw);
 }
 
 float rounded_rect(vec2 pos,
                    vec4 clip_center_radius_tl,
                    vec4 clip_center_radius_tr,
                    vec4 clip_center_radius_br,
                    vec4 clip_center_radius_bl,
                    float aa_range) {
--- a/gfx/webrender/res/prim_shared.glsl
+++ b/gfx/webrender/res/prim_shared.glsl
@@ -31,19 +31,17 @@ vec2 clamp_rect(vec2 point, RectWithSize
 float distance_to_line(vec2 p0, vec2 perp_dir, vec2 p) {
     vec2 dir_to_p0 = p0 - p;
     return dot(normalize(perp_dir), dir_to_p0);
 }
 
 // TODO: convert back to RectWithEndPoint if driver issues are resolved, if ever.
 flat varying vec4 vClipMaskUvBounds;
 varying vec3 vClipMaskUv;
-#ifdef WR_FEATURE_TRANSFORM
-    flat varying vec4 vLocalBounds;
-#endif
+flat varying vec4 vLocalBounds;
 
 // TODO(gw): This is here temporarily while we have
 //           both GPU store and cache. When the GPU
 //           store code is removed, we can change the
 //           PrimitiveInstance instance structure to
 //           use 2x unsigned shorts as vertex attributes
 //           instead of an int, and encode the UV directly
 //           in the vertices.
@@ -66,17 +64,17 @@ vec4[2] fetch_from_resource_cache_2(int 
     return vec4[2](
         TEXEL_FETCH(sResourceCache, uv, 0, ivec2(0, 0)),
         TEXEL_FETCH(sResourceCache, uv, 0, ivec2(1, 0))
     );
 }
 
 #ifdef WR_VERTEX_SHADER
 
-#define VECS_PER_LAYER              10
+#define VECS_PER_LAYER              11
 #define VECS_PER_RENDER_TASK        3
 #define VECS_PER_PRIM_HEADER        2
 #define VECS_PER_TEXT_RUN           3
 #define VECS_PER_GRADIENT           3
 #define VECS_PER_GRADIENT_STOP      2
 
 uniform HIGHP_SAMPLER_FLOAT sampler2D sClipScrollNodes;
 uniform HIGHP_SAMPLER_FLOAT sampler2D sRenderTasks;
@@ -144,16 +142,17 @@ vec4 fetch_from_resource_cache_1(int add
 }
 
 struct ClipScrollNode {
     mat4 transform;
     mat4 inv_transform;
     vec4 local_clip_rect;
     vec2 reference_frame_relative_scroll_offset;
     vec2 scroll_offset;
+    bool is_axis_aligned;
 };
 
 ClipScrollNode fetch_clip_scroll_node(int index) {
     ClipScrollNode node;
 
     // Create a UV base coord for each 8 texels.
     // This is required because trying to use an offset
     // of more than 8 texels doesn't work on some versions
@@ -174,39 +173,44 @@ ClipScrollNode fetch_clip_scroll_node(in
 
     vec4 clip_rect = TEXEL_FETCH(sClipScrollNodes, uv1, 0, ivec2(0, 0));
     node.local_clip_rect = clip_rect;
 
     vec4 offsets = TEXEL_FETCH(sClipScrollNodes, uv1, 0, ivec2(1, 0));
     node.reference_frame_relative_scroll_offset = offsets.xy;
     node.scroll_offset = offsets.zw;
 
+    vec4 misc = TEXEL_FETCH(sClipScrollNodes, uv1, 0, ivec2(2, 0));
+    node.is_axis_aligned = misc.x == 0.0;
+
     return node;
 }
 
 struct Layer {
     mat4 transform;
     mat4 inv_transform;
     RectWithSize local_clip_rect;
+    bool is_axis_aligned;
 };
 
 Layer fetch_layer(int clip_node_id, int scroll_node_id) {
     ClipScrollNode clip_node = fetch_clip_scroll_node(clip_node_id);
     ClipScrollNode scroll_node = fetch_clip_scroll_node(scroll_node_id);
 
     Layer layer;
     layer.transform = scroll_node.transform;
     layer.inv_transform = scroll_node.inv_transform;
 
     vec4 local_clip_rect = clip_node.local_clip_rect;
     local_clip_rect.xy += clip_node.reference_frame_relative_scroll_offset;
     local_clip_rect.xy -= scroll_node.reference_frame_relative_scroll_offset;
     local_clip_rect.xy -= scroll_node.scroll_offset;
 
     layer.local_clip_rect = RectWithSize(local_clip_rect.xy, local_clip_rect.zw);
+    layer.is_axis_aligned = scroll_node.is_axis_aligned;
 
     return layer;
 }
 
 struct RenderTaskCommonData {
     RectWithSize task_rect;
     float texture_layer_index;
 };
@@ -610,18 +614,16 @@ VertexInfo write_vertex(RectWithSize ins
                      task.common_data.task_rect.p0;
 
     gl_Position = uTransform * vec4(final_pos, z, 1.0);
 
     VertexInfo vi = VertexInfo(clamped_local_pos, device_pos);
     return vi;
 }
 
-#ifdef WR_FEATURE_TRANSFORM
-
 float cross2(vec2 v0, vec2 v1) {
     return v0.x * v1.y - v0.y * v1.x;
 }
 
 // Return intersection of line (p0,p1) and line (p2,p3)
 vec2 intersect_lines(vec2 p0, vec2 p1, vec2 p2, vec2 p3) {
     vec2 d0 = p0 - p1;
     vec2 d1 = p2 - p3;
@@ -631,46 +633,52 @@ vec2 intersect_lines(vec2 p0, vec2 p1, v
 
     float d = cross2(d0, d1);
     float nx = s0 * d1.x - d0.x * s1;
     float ny = s0 * d1.y - d0.y * s1;
 
     return vec2(nx / d, ny / d);
 }
 
-VertexInfo write_transform_vertex(RectWithSize instance_rect,
+VertexInfo write_transform_vertex(RectWithSize local_segment_rect,
+                                  RectWithSize local_prim_rect,
                                   RectWithSize local_clip_rect,
                                   vec4 clip_edge_mask,
                                   float z,
                                   Layer layer,
                                   PictureTask task) {
     // Calculate a clip rect from local clip + layer clip.
     RectWithEndpoint clip_rect = to_rect_with_endpoint(local_clip_rect);
     clip_rect.p0 = clamp_rect(clip_rect.p0, layer.local_clip_rect);
     clip_rect.p1 = clamp_rect(clip_rect.p1, layer.local_clip_rect);
 
     // Calculate a clip rect from local_rect + local clip + layer clip.
-    RectWithEndpoint local_rect = to_rect_with_endpoint(instance_rect);
-    local_rect.p0 = clamp(local_rect.p0, clip_rect.p0, clip_rect.p1);
-    local_rect.p1 = clamp(local_rect.p1, clip_rect.p0, clip_rect.p1);
+    RectWithEndpoint segment_rect = to_rect_with_endpoint(local_segment_rect);
+    segment_rect.p0 = clamp(segment_rect.p0, clip_rect.p0, clip_rect.p1);
+    segment_rect.p1 = clamp(segment_rect.p1, clip_rect.p0, clip_rect.p1);
+
+    // Calculate a clip rect from local_rect + local clip + layer clip.
+    RectWithEndpoint prim_rect = to_rect_with_endpoint(local_prim_rect);
+    prim_rect.p0 = clamp(prim_rect.p0, clip_rect.p0, clip_rect.p1);
+    prim_rect.p1 = clamp(prim_rect.p1, clip_rect.p0, clip_rect.p1);
 
     // As this is a transform shader, extrude by 2 (local space) pixels
     // in each direction. This gives enough space around the edge to
     // apply distance anti-aliasing. Technically, it:
     // (a) slightly over-estimates the number of required pixels in the simple case.
     // (b) might not provide enough edge in edge case perspective projections.
     // However, it's fast and simple. If / when we ever run into issues, we
     // can do some math on the projection matrix to work out a variable
     // amount to extrude.
     float extrude_distance = 2.0;
-    instance_rect.p0 -= vec2(extrude_distance);
-    instance_rect.size += vec2(2.0 * extrude_distance);
+    local_segment_rect.p0 -= vec2(extrude_distance);
+    local_segment_rect.size += vec2(2.0 * extrude_distance);
 
     // Select the corner of the local rect that we are processing.
-    vec2 local_pos = instance_rect.p0 + instance_rect.size * aPosition.xy;
+    vec2 local_pos = local_segment_rect.p0 + local_segment_rect.size * aPosition.xy;
 
     // Transform the current vertex to the world cpace.
     vec4 world_pos = layer.transform * vec4(local_pos, 0.0, 1.0);
 
     // Convert the world positions to device pixel space.
     vec2 device_pos = world_pos.xy / world_pos.w * uDevicePixelRatio;
 
     // We want the world space coords to be perspective divided by W.
@@ -678,38 +686,37 @@ VertexInfo write_transform_vertex(RectWi
     // want a constant Z across the primitive, since we're using it
     // for draw ordering - so scale by the W coord to ensure this.
     vec4 final_pos = vec4(world_pos.xy + task.common_data.task_rect.p0 - task.content_origin,
                           z * world_pos.w,
                           world_pos.w);
     gl_Position = uTransform * final_pos;
 
     vLocalBounds = mix(
-        vec4(clip_rect.p0, clip_rect.p1),
-        vec4(local_rect.p0, local_rect.p1),
+        vec4(prim_rect.p0, prim_rect.p1),
+        vec4(segment_rect.p0, segment_rect.p1),
         clip_edge_mask
     );
 
     VertexInfo vi = VertexInfo(local_pos, device_pos);
     return vi;
 }
 
 VertexInfo write_transform_vertex_primitive(Primitive prim) {
     return write_transform_vertex(
         prim.local_rect,
+        prim.local_rect,
         prim.local_clip_rect,
-        vec4(1.0),
+        vec4(0.0),
         prim.z,
         prim.layer,
         prim.task
     );
 }
 
-#endif //WR_FEATURE_TRANSFORM
-
 struct GlyphResource {
     vec4 uv_rect;
     float layer;
     vec2 offset;
     float scale;
 };
 
 GlyphResource fetch_glyph_resource(int address) {
@@ -804,17 +811,16 @@ float compute_aa_range(vec2 position) {
 
 /// Return the blending coefficient to for distance antialiasing.
 ///
 /// 0.0 means inside the shape, 1.0 means outside.
 float distance_aa(float aa_range, float signed_distance) {
     return 1.0 - smoothstep(-aa_range, aa_range, signed_distance);
 }
 
-#ifdef WR_FEATURE_TRANSFORM
 float signed_distance_rect(vec2 pos, vec2 p0, vec2 p1) {
     vec2 d = max(p0 - pos, pos - p1);
     return length(max(vec2(0.0), d)) + min(0.0, max(d.x, d.y));
 }
 
 float init_transform_fs(vec2 local_pos) {
     // Get signed distance from local rect bounds.
     float d = signed_distance_rect(
@@ -824,17 +830,16 @@ float init_transform_fs(vec2 local_pos) 
     );
 
     // Find the appropriate distance to apply the AA smoothstep over.
     float aa_range = compute_aa_range(local_pos);
 
     // Only apply AA to fragments outside the signed distance field.
     return distance_aa(aa_range, d);
 }
-#endif //WR_FEATURE_TRANSFORM
 
 float do_clip() {
     // anything outside of the mask is considered transparent
     bvec4 inside = lessThanEqual(
         vec4(vClipMaskUvBounds.xy, vClipMaskUv.xy),
         vec4(vClipMaskUv.xy, vClipMaskUvBounds.zw));
     // check for the dummy bounds, which are given to the opaque objects
     return vClipMaskUvBounds.xy == vClipMaskUvBounds.zw ? 1.0:
--- a/gfx/webrender/res/ps_border_corner.glsl
+++ b/gfx/webrender/res/ps_border_corner.glsl
@@ -294,16 +294,17 @@ void main(void) {
     write_color(color0, color1, style, color_delta, prim.user_data1);
 
     RectWithSize segment_rect;
     segment_rect.p0 = p0;
     segment_rect.size = p1 - p0;
 
 #ifdef WR_FEATURE_TRANSFORM
     VertexInfo vi = write_transform_vertex(segment_rect,
+                                           prim.local_rect,
                                            prim.local_clip_rect,
                                            vec4(1.0),
                                            prim.z,
                                            prim.layer,
                                            prim.task);
 #else
     VertexInfo vi = write_vertex(segment_rect,
                                  prim.local_clip_rect,
--- a/gfx/webrender/res/ps_border_edge.glsl
+++ b/gfx/webrender/res/ps_border_edge.glsl
@@ -212,16 +212,17 @@ void main(void) {
     }
 
     write_alpha_select(style);
     write_color0(color, style, color_flip);
     write_color1(color, style, color_flip);
 
 #ifdef WR_FEATURE_TRANSFORM
     VertexInfo vi = write_transform_vertex(segment_rect,
+                                           prim.local_rect,
                                            prim.local_clip_rect,
                                            vec4(1.0),
                                            prim.z,
                                            prim.layer,
                                            prim.task);
 #else
     VertexInfo vi = write_vertex(segment_rect,
                                  prim.local_clip_rect,
--- a/gfx/webrender/res/ps_composite.glsl
+++ b/gfx/webrender/res/ps_composite.glsl
@@ -202,29 +202,29 @@ const int MixBlendMode_Hue         = 12;
 const int MixBlendMode_Saturation  = 13;
 const int MixBlendMode_Color       = 14;
 const int MixBlendMode_Luminosity  = 15;
 
 void main(void) {
     vec4 Cb = texture(sCacheRGBA8, vUv0);
     vec4 Cs = texture(sCacheRGBA8, vUv1);
 
-    // The mix-blend-mode functions assume no premultiplied alpha
-    Cb.rgb /= Cb.a;
-    Cs.rgb /= Cs.a;
-
     if (Cb.a == 0.0) {
         oFragColor = Cs;
         return;
     }
     if (Cs.a == 0.0) {
         oFragColor = vec4(0.0, 0.0, 0.0, 0.0);
         return;
     }
 
+    // The mix-blend-mode functions assume no premultiplied alpha
+    Cb.rgb /= Cb.a;
+    Cs.rgb /= Cs.a;
+
     // Return yellow if none of the branches match (shouldn't happen).
     vec4 result = vec4(1.0, 1.0, 0.0, 1.0);
 
     switch (vOp) {
         case MixBlendMode_Multiply:
             result.rgb = Multiply(Cb.rgb, Cs.rgb);
             break;
         case MixBlendMode_Screen:
--- a/gfx/webrender/res/ps_gradient.glsl
+++ b/gfx/webrender/res/ps_gradient.glsl
@@ -63,16 +63,17 @@ void main(void) {
         // Adjust the stop colors by how much they were clamped
         vec2 adjusted_offset = (g01_y_clamped - g01_y.xx) / (g01_y.y - g01_y.x);
         adjusted_color_g0 = mix(g0.color, g1.color, adjusted_offset.x);
         adjusted_color_g1 = mix(g0.color, g1.color, adjusted_offset.y);
     }
 
 #ifdef WR_FEATURE_TRANSFORM
     VertexInfo vi = write_transform_vertex(segment_rect,
+                                           prim.local_rect,
                                            prim.local_clip_rect,
                                            vec4(1.0),
                                            prim.z,
                                            prim.layer,
                                            prim.task);
     vLocalPos = vi.local_pos;
     vec2 f = (vi.local_pos.xy - prim.local_rect.p0) / prim.local_rect.size;
 #else
--- a/gfx/webrender/res/ps_rectangle.glsl
+++ b/gfx/webrender/res/ps_rectangle.glsl
@@ -12,16 +12,17 @@ varying vec2 vLocalPos;
 
 #ifdef WR_VERTEX_SHADER
 void main(void) {
     Primitive prim = load_primitive();
     Rectangle rect = fetch_rectangle(prim.specific_prim_address);
     vColor = rect.color;
 #ifdef WR_FEATURE_TRANSFORM
     VertexInfo vi = write_transform_vertex(prim.local_rect,
+                                           prim.local_rect,
                                            prim.local_clip_rect,
                                            rect.edge_aa_segment_mask,
                                            prim.z,
                                            prim.layer,
                                            prim.task);
     vLocalPos = vi.local_pos;
 #else
     VertexInfo vi = write_vertex(prim.local_rect,
--- a/gfx/webrender/src/clip.rs
+++ b/gfx/webrender/src/clip.rs
@@ -1,24 +1,22 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use api::{BorderRadius, ComplexClipRegion, DeviceIntRect, ImageMask, ImageRendering, LayerPoint};
-use api::{ClipMode, LayerRect, LayerSize};
+use api::{ClipMode, LayerRect};
 use api::{LayerToWorldTransform, LayoutPoint, LayoutVector2D, LocalClip};
 use border::BorderCornerClipSource;
 use ellipse::Ellipse;
 use freelist::{FreeList, FreeListHandle, WeakFreeListHandle};
 use gpu_cache::{GpuCache, GpuCacheHandle, ToGpuBlocks};
 use prim_store::{ClipData, ImageMaskData};
 use resource_cache::ResourceCache;
-use util::{extract_inner_rect_safe, TransformedRect};
-
-pub const MAX_CLIP: f32 = 1000000.0;
+use util::{extract_inner_rect_safe, MaxRect, TransformedRect};
 
 pub type ClipStore = FreeList<ClipSources>;
 pub type ClipSourcesHandle = FreeListHandle<ClipSources>;
 pub type ClipSourcesWeakHandle = WeakFreeListHandle<ClipSources>;
 
 #[derive(Clone, Debug)]
 pub struct ClipRegion {
     pub main: LayerRect,
@@ -98,162 +96,175 @@ impl From<ClipRegion> for ClipSources {
         }
 
         ClipSources::new(clips)
     }
 }
 
 impl ClipSource {
     pub fn contains(&self, point: &LayerPoint) -> bool {
-        // We currently do not handle all types of clip sources, because they
-        // aren't used for ClipScrollNodes and this method is only used during hit testing.
+        // We currently do not handle all BorderCorners, because they aren't used for
+        // ClipScrollNodes and this method is only used during hit testing.
         match self {
             &ClipSource::Rectangle(ref rectangle) => rectangle.contains(point),
             &ClipSource::RoundedRectangle(rect, radii, ClipMode::Clip) =>
                 rounded_rectangle_contains_point(point, &rect, &radii),
             &ClipSource::RoundedRectangle(rect, radii, ClipMode::ClipOut) =>
                 !rounded_rectangle_contains_point(point, &rect, &radii),
-            _ => unreachable!("Tried to call contains on an unsupported ClipSource."),
+            &ClipSource::Image(mask) => mask.rect.contains(point),
+            &ClipSource::BorderCorner(_) =>
+                unreachable!("Tried to call contains on a BorderCornerr."),
         }
     }
 
 }
 
 #[derive(Debug)]
 pub struct ClipSources {
     pub clips: Vec<(ClipSource, GpuCacheHandle)>,
-    pub bounds: MaskBounds,
+    pub local_inner_rect: LayerRect,
+    pub local_outer_rect: Option<LayerRect>
 }
 
 impl ClipSources {
     pub fn new(clips: Vec<ClipSource>) -> ClipSources {
+        let (local_inner_rect, local_outer_rect) = Self::calculate_inner_and_outer_rects(&clips);
+
         let clips = clips
             .into_iter()
             .map(|clip| (clip, GpuCacheHandle::new()))
             .collect();
 
         ClipSources {
             clips,
-            bounds: MaskBounds {
-                inner: None,
-                outer: None,
-            },
+            local_inner_rect,
+            local_outer_rect,
         }
     }
 
     pub fn clips(&self) -> &[(ClipSource, GpuCacheHandle)] {
         &self.clips
     }
 
-    pub fn update(
-        &mut self,
-        layer_transform: &LayerToWorldTransform,
-        gpu_cache: &mut GpuCache,
-        resource_cache: &mut ResourceCache,
-        device_pixel_ratio: f32,
-    ) {
-        if self.clips.is_empty() {
-            return;
+    fn calculate_inner_and_outer_rects(clips: &Vec<ClipSource>) -> (LayerRect, Option<LayerRect>) {
+        if clips.is_empty() {
+            return (LayerRect::zero(), None);
         }
 
-        // compute the local bounds
-        if self.bounds.inner.is_none() {
-            let mut local_rect = Some(LayerRect::new(
-                LayerPoint::new(-MAX_CLIP, -MAX_CLIP),
-                LayerSize::new(2.0 * MAX_CLIP, 2.0 * MAX_CLIP),
-            ));
-            let mut local_inner = local_rect;
-            let mut has_clip_out = false;
-            let mut has_border_clip = false;
-
-            for &(ref source, _) in &self.clips {
-                match *source {
-                    ClipSource::Image(ref mask) => {
-                        if !mask.repeat {
-                            local_rect = local_rect.and_then(|r| r.intersection(&mask.rect));
-                        }
-                        local_inner = None;
+        // Depending on the complexity of the clip, we may either know the outer and/or inner
+        // rect, or neither of these.  In the case of a clip-out, we currently set the mask bounds
+        // to be unknown. This is conservative, but ensures correctness. In the future we can make
+        // this a lot more clever with some proper region handling.
+        let mut local_outer = Some(LayerRect::max_rect());
+        let mut local_inner = local_outer;
+        let mut can_calculate_inner_rect = true;
+        let mut can_calculate_outer_rect = true;
+        for source in clips {
+            match *source {
+                ClipSource::Image(ref mask) => {
+                    if !mask.repeat {
+                        local_outer = local_outer.and_then(|r| r.intersection(&mask.rect));
+                        can_calculate_inner_rect = false;
+                    } else {
+                        can_calculate_inner_rect = false;
+                        can_calculate_outer_rect = false;
+                        break;
                     }
-                    ClipSource::Rectangle(rect) => {
-                        local_rect = local_rect.and_then(|r| r.intersection(&rect));
-                        local_inner = local_inner.and_then(|r| r.intersection(&rect));
+                    local_inner = None;
+                }
+                ClipSource::Rectangle(rect) => {
+                    local_outer = local_outer.and_then(|r| r.intersection(&rect));
+                    local_inner = local_inner.and_then(|r| r.intersection(&rect));
+                }
+                ClipSource::RoundedRectangle(ref rect, ref radius, mode) => {
+                    // Once we encounter a clip-out, we just assume the worst
+                    // case clip mask size, for now.
+                    if mode == ClipMode::ClipOut {
+                        can_calculate_inner_rect = false;
+                        can_calculate_outer_rect = false;
+                        break;
                     }
-                    ClipSource::RoundedRectangle(ref rect, ref radius, mode) => {
-                        // Once we encounter a clip-out, we just assume the worst
-                        // case clip mask size, for now.
-                        if mode == ClipMode::ClipOut {
-                            has_clip_out = true;
-                        }
+
+                    local_outer = local_outer.and_then(|r| r.intersection(rect));
 
-                        local_rect = local_rect.and_then(|r| r.intersection(rect));
-
-                        let inner_rect = extract_inner_rect_safe(rect, radius);
-                        local_inner = local_inner
-                            .and_then(|r| inner_rect.and_then(|ref inner| r.intersection(inner)));
-                    }
-                    ClipSource::BorderCorner { .. } => {
-                        has_border_clip = true;
-                    }
+                    let inner_rect = extract_inner_rect_safe(rect, radius);
+                    local_inner = local_inner
+                        .and_then(|r| inner_rect.and_then(|ref inner| r.intersection(inner)));
+                }
+                ClipSource::BorderCorner { .. } => {
+                    can_calculate_inner_rect = false;
+                    can_calculate_outer_rect = false;
+                    break;
                 }
             }
-
-            // Work out the type of mask geometry we have, based on the
-            // list of clip sources above.
-            self.bounds = if has_clip_out || has_border_clip {
-                // For clip-out, the mask rect is not known.
-                MaskBounds {
-                    outer: None,
-                    inner: Some(LayerRect::zero().into()),
-                }
-            } else {
-                MaskBounds {
-                    outer: Some(local_rect.unwrap_or(LayerRect::zero()).into()),
-                    inner: Some(local_inner.unwrap_or(LayerRect::zero()).into()),
-                }
-            };
         }
 
-        // update the screen bounds
-        self.bounds.update(layer_transform, device_pixel_ratio);
+        let outer = match can_calculate_outer_rect {
+            true => local_outer,
+            false => None,
+        };
 
+        let inner = match can_calculate_inner_rect {
+            true => local_inner.unwrap_or(LayerRect::zero()),
+            false => LayerRect::zero(),
+        };
+
+        (inner, outer)
+    }
+
+    pub fn update(
+        &mut self,
+        gpu_cache: &mut GpuCache,
+        resource_cache: &mut ResourceCache,
+    ) {
         for &mut (ref mut source, ref mut handle) in &mut self.clips {
             if let Some(mut request) = gpu_cache.request(handle) {
                 match *source {
                     ClipSource::Image(ref mask) => {
-                        let data = ImageMaskData {
-                            local_rect: mask.rect,
-                        };
+                        let data = ImageMaskData { local_rect: mask.rect };
                         data.write_gpu_blocks(request);
                     }
                     ClipSource::Rectangle(rect) => {
                         let data = ClipData::uniform(rect, 0.0, ClipMode::Clip);
                         data.write(&mut request);
                     }
                     ClipSource::RoundedRectangle(ref rect, ref radius, mode) => {
                         let data = ClipData::rounded_rect(rect, radius, mode);
                         data.write(&mut request);
                     }
                     ClipSource::BorderCorner(ref mut source) => {
                         source.write(request);
                     }
                 }
             }
-        }
 
-        for &(ref clip, _) in &self.clips {
-            if let ClipSource::Image(ref mask) = *clip {
+            if let ClipSource::Image(ref mask) = *source {
                 resource_cache.request_image(mask.image, ImageRendering::Auto, None, gpu_cache);
             }
         }
     }
 
     /// Whether or not this ClipSources has any clips (does any clipping).
     pub fn has_clips(&self) -> bool {
         !self.clips.is_empty()
     }
+
+    pub fn get_screen_bounds(
+        &self,
+        transform: &LayerToWorldTransform,
+        device_pixel_ratio: f32,
+    ) -> (DeviceIntRect, Option<DeviceIntRect>) {
+        let screen_inner_rect =
+            TransformedRect::new(&self.local_inner_rect, transform, device_pixel_ratio);
+        let screen_outer_rect = self.local_outer_rect.map(|outer_rect|
+            TransformedRect::new(&outer_rect, transform, device_pixel_ratio).bounding_rect
+        );
+
+        (screen_inner_rect.bounding_rect, screen_outer_rect)
+    }
 }
 
 /// Represents a local rect and a device space
 /// rectangles that are either outside or inside bounds.
 #[derive(Clone, Debug, PartialEq)]
 pub struct Geometry {
     pub local_rect: LayerRect,
     pub device_rect: DeviceIntRect,
@@ -263,43 +274,16 @@ impl From<LayerRect> for Geometry {
     fn from(local_rect: LayerRect) -> Self {
         Geometry {
             local_rect,
             device_rect: DeviceIntRect::zero(),
         }
     }
 }
 
-/// Depending on the complexity of the clip, we may either
-/// know the outer and/or inner rect, or neither or these.
-/// In the case of a clip-out, we currently set the mask
-/// bounds to be unknown. This is conservative, but ensures
-/// correctness. In the future we can make this a lot
-/// more clever with some proper region handling.
-#[derive(Clone, Debug, PartialEq)]
-pub struct MaskBounds {
-    pub outer: Option<Geometry>,
-    pub inner: Option<Geometry>,
-}
-
-impl MaskBounds {
-    pub fn update(&mut self, transform: &LayerToWorldTransform, device_pixel_ratio: f32) {
-        if let Some(ref mut outer) = self.outer {
-            let transformed =
-                TransformedRect::new(&outer.local_rect, transform, device_pixel_ratio);
-            outer.device_rect = transformed.bounding_rect;
-        }
-        if let Some(ref mut inner) = self.inner {
-            let transformed =
-                TransformedRect::new(&inner.local_rect, transform, device_pixel_ratio);
-            inner.device_rect = transformed.inner_rect;
-        }
-    }
-}
-
 pub trait Contains {
     fn contains(&self, point: &LayoutPoint) -> bool;
 }
 
 impl Contains for LocalClip {
     fn contains(&self, point: &LayoutPoint) -> bool {
         if !self.clip_rect().contains(point) {
             return false;
--- a/gfx/webrender/src/clip_scroll_node.rs
+++ b/gfx/webrender/src/clip_scroll_node.rs
@@ -12,26 +12,24 @@ use euclid::SideOffsets2D;
 use geometry::ray_intersects_rect;
 use gpu_cache::GpuCache;
 use gpu_types::{ClipScrollNodeIndex, ClipScrollNodeData};
 use render_task::{ClipChain, ClipChainNode, ClipWorkItem};
 use resource_cache::ResourceCache;
 use scene::SceneProperties;
 use spring::{DAMPING, STIFFNESS, Spring};
 use std::rc::Rc;
-use util::{MatrixHelpers, MaxRect};
+use util::{MatrixHelpers, MaxRect, TransformedRectKind};
 
 #[cfg(target_os = "macos")]
 const CAN_OVERSCROLL: bool = true;
 
 #[cfg(not(target_os = "macos"))]
 const CAN_OVERSCROLL: bool = false;
 
-const MAX_LOCAL_VIEWPORT: f32 = 1000000.0;
-
 #[derive(Debug)]
 pub struct StickyFrameInfo {
     pub margins: SideOffsets2D<Option<f32>>,
     pub vertical_offset_bounds: StickyOffsetBounds,
     pub horizontal_offset_bounds: StickyOffsetBounds,
     pub previously_applied_offset: LayoutVector2D,
     pub current_offset: LayerVector2D,
 }
@@ -289,33 +287,38 @@ impl ClipScrollNode {
             state,
             device_pixel_ratio,
             clip_store,
             resource_cache,
             gpu_cache,
         );
 
         let local_clip_rect = if self.world_content_transform.has_perspective_component() {
-            LayerRect::new(
-                LayerPoint::new(-MAX_LOCAL_VIEWPORT, -MAX_LOCAL_VIEWPORT),
-                LayerSize::new(2.0 * MAX_LOCAL_VIEWPORT, 2.0 * MAX_LOCAL_VIEWPORT)
-            )
+            LayerRect::max_rect()
         } else {
             self.combined_local_viewport_rect
         };
 
         let data = match self.world_content_transform.inverse() {
             Some(inverse) => {
+                let transform_kind = if self.world_content_transform.preserves_2d_axis_alignment() {
+                    TransformedRectKind::AxisAligned
+                } else {
+                    TransformedRectKind::Complex
+                };
+
                 ClipScrollNodeData {
                     transform: self.world_content_transform,
                     inv_transform: inverse,
                     local_clip_rect,
                     reference_frame_relative_scroll_offset:
                         self.reference_frame_relative_scroll_offset,
                     scroll_offset: self.scroll_offset(),
+                    transform_kind: transform_kind as u32 as f32,
+                    padding: [0.0; 3],
                 }
             }
             None => {
                 state.combined_outer_clip_bounds = DeviceIntRect::zero();
                 self.combined_clip_outer_bounds = DeviceIntRect::zero();
                 ClipScrollNodeData::invalid()
             }
         };
@@ -327,53 +330,74 @@ impl ClipScrollNode {
     pub fn update_clip_work_item(
         &mut self,
         state: &mut TransformUpdateState,
         device_pixel_ratio: f32,
         clip_store: &mut ClipStore,
         resource_cache: &mut ResourceCache,
         gpu_cache: &mut GpuCache,
     ) {
-        let current_clip_chain = state.parent_clip_chain.clone();
+        let mut current_clip_chain = state.parent_clip_chain.clone();
         let clip_sources_handle = match self.node_type {
             NodeType::Clip(ref handle) => handle,
             _ => {
                 self.clip_chain_node = current_clip_chain;
                 self.combined_clip_outer_bounds = state.combined_outer_clip_bounds;
                 return;
             }
         };
 
         let clip_sources = clip_store.get_mut(clip_sources_handle);
-        clip_sources.update(
-            &self.world_viewport_transform,
-            gpu_cache,
-            resource_cache,
-            device_pixel_ratio,
-        );
+        clip_sources.update(gpu_cache, resource_cache);
+        let (screen_inner_rect, screen_outer_rect) =
+            clip_sources.get_screen_bounds(&self.world_viewport_transform, device_pixel_ratio);
+
+        // If this clip's inner rectangle completely surrounds the existing clip
+        // chain's outer rectangle, we can discard this clip entirely since it isn't
+        // going to affect anything.
+        if screen_inner_rect.contains_rect(&state.combined_outer_clip_bounds) {
+            self.clip_chain_node = current_clip_chain;
+            self.combined_clip_outer_bounds = state.combined_outer_clip_bounds;
+            return;
+        }
 
-        let outer_bounds = clip_sources.bounds.outer.as_ref().map_or_else(
-            DeviceIntRect::zero,
-            |rect| rect.device_rect
-        );
+        let combined_outer_screen_rect = match screen_outer_rect {
+            Some(outer_rect) => {
+                // If this clip's outer rectangle is completely enclosed by the clip
+                // chain's inner rectangle, then the only clip that matters from this point
+                // on is this clip. We can disconnect this clip from the parent clip chain.
+                if state.combined_inner_clip_bounds.contains_rect(&outer_rect) {
+                    current_clip_chain = None;
+                }
+                outer_rect.intersection(&state.combined_outer_clip_bounds)
+                    .unwrap_or_else(DeviceIntRect::zero)
+            }
+            None => state.combined_outer_clip_bounds,
+        };
 
-        self.combined_clip_outer_bounds = outer_bounds.intersection(
-            &state.combined_outer_clip_bounds).unwrap_or_else(DeviceIntRect::zero);
+        let combined_inner_screen_rect =
+            state.combined_inner_clip_bounds.intersection(&screen_inner_rect)
+            .unwrap_or_else(DeviceIntRect::zero);
 
-        // TODO: Combine rectangles in the same axis-aligned clip space here?
+        state.combined_outer_clip_bounds = combined_outer_screen_rect;
+        state.combined_inner_clip_bounds = combined_inner_screen_rect;
+        self.combined_clip_outer_bounds = combined_outer_screen_rect;
+
         self.clip_chain_node = Some(Rc::new(ClipChainNode {
             work_item: ClipWorkItem {
                 scroll_node_data_index: self.node_data_index,
                 clip_sources: clip_sources_handle.weak(),
                 coordinate_system_id: state.current_coordinate_system_id,
             },
+            screen_inner_rect,
+            combined_outer_screen_rect,
+            combined_inner_screen_rect,
             prev: current_clip_chain,
         }));
 
-        state.combined_outer_clip_bounds = self.combined_clip_outer_bounds;
         state.parent_clip_chain = self.clip_chain_node.clone();
     }
 
     pub fn update_transform(
         &mut self,
         state: &mut TransformUpdateState,
         scene_properties: &SceneProperties,
     ) {
--- a/gfx/webrender/src/clip_scroll_tree.rs
+++ b/gfx/webrender/src/clip_scroll_tree.rs
@@ -9,27 +9,32 @@ use clip::ClipStore;
 use clip_scroll_node::{ClipScrollNode, NodeType, ScrollingState, StickyFrameInfo};
 use gpu_cache::GpuCache;
 use gpu_types::ClipScrollNodeData;
 use internal_types::{FastHashMap, FastHashSet};
 use print_tree::{PrintTree, PrintTreePrinter};
 use render_task::ClipChain;
 use resource_cache::ResourceCache;
 use scene::SceneProperties;
+use util::MaxRect;
 
 pub type ScrollStates = FastHashMap<ClipId, ScrollingState>;
 
 /// An id that identifies coordinate systems in the ClipScrollTree. Each
 /// coordinate system has an id and those ids will be shared when the coordinate
 /// systems are the same or are in the same axis-aligned space. This allows
 /// for optimizing mask generation.
 #[derive(Debug, Copy, Clone, PartialEq)]
 pub struct CoordinateSystemId(pub u32);
 
 impl CoordinateSystemId {
+    pub fn root() -> CoordinateSystemId {
+        CoordinateSystemId(0)
+    }
+
     pub fn next(&self) -> CoordinateSystemId {
         let CoordinateSystemId(id) = *self;
         CoordinateSystemId(id + 1)
     }
 }
 
 pub struct ClipScrollTree {
     pub nodes: FastHashMap<ClipId, ClipScrollNode>,
@@ -61,16 +66,17 @@ pub struct ClipScrollTree {
 pub struct TransformUpdateState {
     pub parent_reference_frame_transform: LayerToWorldTransform,
     pub parent_combined_viewport_rect: LayerRect,
     pub parent_accumulated_scroll_offset: LayerVector2D,
     pub nearest_scrolling_ancestor_offset: LayerVector2D,
     pub nearest_scrolling_ancestor_viewport: LayerRect,
     pub parent_clip_chain: ClipChain,
     pub combined_outer_clip_bounds: DeviceIntRect,
+    pub combined_inner_clip_bounds: DeviceIntRect,
 
     /// An id for keeping track of the axis-aligned space of this node. This is used in
     /// order to track what kinds of clip optimizations can be done for a particular
     /// display list item, since optimizations can usually only be done among
     /// coordinate systems which are relatively axis aligned.
     pub current_coordinate_system_id: CoordinateSystemId,
     pub next_coordinate_system_id: CoordinateSystemId,
 }
@@ -350,16 +356,17 @@ impl ClipScrollTree {
                 0.0,
             ),
             parent_combined_viewport_rect: root_viewport,
             parent_accumulated_scroll_offset: LayerVector2D::zero(),
             nearest_scrolling_ancestor_offset: LayerVector2D::zero(),
             nearest_scrolling_ancestor_viewport: LayerRect::zero(),
             parent_clip_chain: None,
             combined_outer_clip_bounds: *screen_rect,
+            combined_inner_clip_bounds: DeviceIntRect::max_rect(),
             current_coordinate_system_id: CoordinateSystemId(0),
             next_coordinate_system_id: CoordinateSystemId(0).next(),
         };
         self.update_node(
             root_reference_frame_id,
             &mut state,
             device_pixel_ratio,
             clip_store,
--- a/gfx/webrender/src/device.rs
+++ b/gfx/webrender/src/device.rs
@@ -386,17 +386,16 @@ impl ExternalTexture {
 
 pub struct Texture {
     id: gl::GLuint,
     target: gl::GLuint,
     layer_count: i32,
     format: ImageFormat,
     width: u32,
     height: u32,
-
     filter: TextureFilter,
     render_target: Option<RenderTargetInfo>,
     fbo_ids: Vec<FBOId>,
     depth_rb: Option<RBOId>,
 }
 
 impl Texture {
     pub fn get_dimensions(&self) -> DeviceUintSize {
@@ -406,16 +405,20 @@ impl Texture {
     pub fn get_render_target_layer_count(&self) -> usize {
         self.fbo_ids.len()
     }
 
     pub fn get_layer_count(&self) -> i32 {
         self.layer_count
     }
 
+    pub fn get_format(&self) -> ImageFormat {
+        self.format
+    }
+
     pub fn get_bpp(&self) -> u32 {
         match self.format {
             ImageFormat::A8 => 1,
             ImageFormat::RGB8 => 3,
             ImageFormat::BGRA8 => 4,
             ImageFormat::RG8 => 2,
             ImageFormat::RGBAF32 => 16,
             ImageFormat::Invalid => unreachable!(),
@@ -880,37 +883,39 @@ impl Device {
         format: ImageFormat,
         filter: TextureFilter,
         render_target: Option<RenderTargetInfo>,
         layer_count: i32,
         pixels: Option<&[u8]>,
     ) {
         debug_assert!(self.inside_frame);
 
-        let resized = texture.width != width || texture.height != height;
+        let resized = texture.width != width ||
+            texture.height != height ||
+            texture.format != format;
 
         texture.format = format;
         texture.width = width;
         texture.height = height;
         texture.filter = filter;
         texture.layer_count = layer_count;
         texture.render_target = render_target;
 
-        let (internal_format, gl_format) = gl_texture_formats_for_image_format(self.gl(), format);
-        let type_ = gl_type_for_texture_format(format);
-
         self.bind_texture(DEFAULT_TEXTURE, texture);
         self.set_texture_parameters(texture.target, filter);
 
         match render_target {
             Some(info) => {
                 assert!(pixels.is_none());
                 self.update_texture_storage(texture, &info, resized);
             }
             None => {
+                let (internal_format, gl_format) = gl_texture_formats_for_image_format(self.gl(), format);
+                let type_ = gl_type_for_texture_format(format);
+
                 let expanded_data: Vec<u8>;
                 let actual_pixels = if pixels.is_some() && format == ImageFormat::A8 &&
                     cfg!(any(target_arch = "arm", target_arch = "aarch64"))
                 {
                     expanded_data = pixels
                         .unwrap()
                         .iter()
                         .flat_map(|&byte| repeat(byte).take(4))
--- a/gfx/webrender/src/frame_builder.rs
+++ b/gfx/webrender/src/frame_builder.rs
@@ -9,17 +9,17 @@ use api::{DocumentLayer, ExtendMode, Fon
 use api::{GlyphInstance, GlyphOptions, GradientStop, HitTestFlags, HitTestItem, HitTestResult};
 use api::{ImageKey, ImageRendering, ItemRange, ItemTag, LayerPoint, LayerPrimitiveInfo, LayerRect};
 use api::{LayerSize, LayerToScrollTransform, LayerVector2D, LayoutVector2D, LineOrientation};
 use api::{LineStyle, LocalClip, PipelineId, RepeatMode};
 use api::{ScrollSensitivity, Shadow, TileOffset, TransformStyle};
 use api::{PremultipliedColorF, WorldPoint, YuvColorSpace, YuvData};
 use app_units::Au;
 use border::ImageBorderSegment;
-use clip::{ClipRegion, ClipSource, ClipSources, ClipStore, Contains, MAX_CLIP};
+use clip::{ClipRegion, ClipSource, ClipSources, ClipStore, Contains};
 use clip_scroll_node::{ClipScrollNode, NodeType};
 use clip_scroll_tree::ClipScrollTree;
 use euclid::{SideOffsets2D, vec2};
 use frame::FrameId;
 use glyph_rasterizer::FontInstance;
 use gpu_cache::GpuCache;
 use internal_types::{FastHashMap, FastHashSet};
 use picture::{PictureCompositeMode, PictureKind, PicturePrimitive, RasterizationSpace};
@@ -31,17 +31,17 @@ use prim_store::{RectangleContent, Recta
 use profiler::{FrameProfileCounters, GpuCacheProfileCounters, TextureCacheProfileCounters};
 use render_task::{ClearMode, RenderTask, RenderTaskId, RenderTaskTree};
 use resource_cache::ResourceCache;
 use scene::{ScenePipeline, SceneProperties};
 use std::{mem, usize, f32};
 use tiling::{CompositeOps, Frame};
 use tiling::{RenderPass, RenderPassKind, RenderTargetKind};
 use tiling::{RenderTargetContext, ScrollbarPrimitive};
-use util::{self, pack_as_float, RectHelpers, recycle_vec};
+use util::{self, MaxRect, pack_as_float, RectHelpers, recycle_vec};
 
 #[derive(Debug)]
 pub struct ScrollbarInfo(pub ClipId, pub LayerRect);
 
 /// Properties of a stacking context that are maintained
 /// during creation of the scene. These structures are
 /// not persisted after the initial scene build.
 struct StackingContext {
@@ -295,20 +295,17 @@ impl FrameBuilder {
         let current_reference_frame_id = self.current_reference_frame_id();
 
         // An arbitrary large clip rect. For now, we don't
         // specify a clip specific to the stacking context.
         // However, now that they are represented as Picture
         // primitives, we can apply any kind of clip mask
         // to them, as for a normal primitive. This is needed
         // to correctly handle some CSS cases (see #1957).
-        let max_clip = LayerRect::new(
-            LayerPoint::new(-MAX_CLIP, -MAX_CLIP),
-            LayerSize::new(2.0 * MAX_CLIP, 2.0 * MAX_CLIP),
-        );
+        let max_clip = LayerRect::max_rect();
 
         // If there is no root picture, create one for the main framebuffer.
         if self.sc_stack.is_empty() {
             // Should be no pictures at all if the stack is empty...
             debug_assert!(self.prim_store.cpu_pictures.is_empty());
             debug_assert_eq!(transform_style, TransformStyle::Flat);
 
             // This picture stores primitive runs for items on the
@@ -1559,35 +1556,34 @@ impl FrameBuilder {
         let root_prim_context = PrimitiveContext::new(
             device_pixel_ratio,
             display_list,
             root_clip_scroll_node,
             root_clip_scroll_node,
         );
 
         let mut child_tasks = Vec::new();
-
         self.prim_store.reset_prim_visibility();
-
         self.prim_store.prepare_prim_runs(
             &prim_run_cmds,
             root_clip_scroll_node.pipeline_id,
             gpu_cache,
             resource_cache,
             render_tasks,
             &mut self.clip_store,
             clip_scroll_tree,
             pipelines,
             &root_prim_context,
             true,
             &mut child_tasks,
             profile_counters,
             None,
             scene_properties,
             SpecificPrimitiveIndex(0),
+            &self.screen_rect.to_i32(),
         );
 
         let pic = &mut self.prim_store.cpu_pictures[0];
         pic.runs = prim_run_cmds;
 
         let root_render_task = RenderTask::new_picture(
             None,
             PrimitiveIndex(0),
@@ -1658,17 +1654,16 @@ impl FrameBuilder {
         profile_counters
             .total_primitives
             .set(self.prim_store.prim_count());
 
         resource_cache.begin_frame(frame_id);
         gpu_cache.begin_frame();
 
         let mut node_data = Vec::new();
-
         clip_scroll_tree.update_tree(
             &self.screen_rect.to_i32(),
             device_pixel_ratio,
             &mut self.clip_store,
             resource_cache,
             gpu_cache,
             pan,
             &mut node_data,
--- a/gfx/webrender/src/glyph_rasterizer.rs
+++ b/gfx/webrender/src/glyph_rasterizer.rs
@@ -174,33 +174,35 @@ impl FontInstance {
     pub fn get_subpx_offset(&self, glyph: &GlyphKey) -> (f64, f64) {
         match self.subpx_dir {
             SubpixelDirection::None => (0.0, 0.0),
             SubpixelDirection::Horizontal => (glyph.subpixel_offset.into(), 0.0),
             SubpixelDirection::Vertical => (0.0, glyph.subpixel_offset.into()),
         }
     }
 
-    pub fn get_subpixel_glyph_format(&self) -> GlyphFormat {
-        if self.transform.is_identity() { GlyphFormat::Subpixel } else { GlyphFormat::TransformedSubpixel }
-    }
-
-    #[allow(dead_code)]
-    pub fn get_glyph_format(&self) -> GlyphFormat {
+    pub fn get_glyph_format(&self, color_bitmaps: bool) -> GlyphFormat {
         match self.render_mode {
-            FontRenderMode::Mono | FontRenderMode::Alpha => GlyphFormat::Alpha,
-            FontRenderMode::Subpixel => self.get_subpixel_glyph_format(),
-            FontRenderMode::Bitmap => GlyphFormat::ColorBitmap,
+            FontRenderMode::Mono | FontRenderMode::Alpha => {
+                if self.transform.is_identity() { GlyphFormat::Alpha } else { GlyphFormat::TransformedAlpha }
+            }
+            FontRenderMode::Subpixel => {
+                if self.transform.is_identity() { GlyphFormat::Subpixel } else { GlyphFormat::TransformedSubpixel }
+            }
+            FontRenderMode::Bitmap => {
+                if color_bitmaps { GlyphFormat::ColorBitmap } else { GlyphFormat::Alpha }
+            }
         }
     }
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
 pub enum GlyphFormat {
     Alpha,
+    TransformedAlpha,
     Subpixel,
     TransformedSubpixel,
     ColorBitmap,
 }
 
 pub struct RasterizedGlyph {
     pub top: f32,
     pub left: f32,
--- a/gfx/webrender/src/gpu_types.rs
+++ b/gfx/webrender/src/gpu_types.rs
@@ -193,21 +193,25 @@ pub struct ClipScrollNodeIndex(pub u32);
 #[derive(Debug)]
 #[repr(C)]
 pub struct ClipScrollNodeData {
     pub transform: LayerToWorldTransform,
     pub inv_transform: WorldToLayerTransform,
     pub local_clip_rect: LayerRect,
     pub reference_frame_relative_scroll_offset: LayerVector2D,
     pub scroll_offset: LayerVector2D,
+    pub transform_kind: f32,
+    pub padding: [f32; 3],
 }
 
 impl ClipScrollNodeData {
     pub fn invalid() -> ClipScrollNodeData {
         ClipScrollNodeData {
             transform: LayerToWorldTransform::identity(),
             inv_transform: WorldToLayerTransform::identity(),
             local_clip_rect: LayerRect::zero(),
             reference_frame_relative_scroll_offset: LayerVector2D::zero(),
             scroll_offset: LayerVector2D::zero(),
+            transform_kind: 0.0,
+            padding: [0.0; 3],
         }
     }
 }
--- a/gfx/webrender/src/lib.rs
+++ b/gfx/webrender/src/lib.rs
@@ -149,11 +149,11 @@ extern crate time;
 extern crate ws;
 pub extern crate webrender_api;
 
 #[doc(hidden)]
 pub use device::{build_shader_strings, ProgramCache};
 pub use renderer::{CpuProfile, DebugFlags, GpuProfile, OutputImageHandler, RendererKind};
 pub use renderer::{ExternalImage, ExternalImageHandler, ExternalImageSource};
 pub use renderer::{GraphicsApi, GraphicsApiInfo, ReadPixelsFormat, Renderer, RendererOptions};
-pub use renderer::{ThreadListener};
+pub use renderer::{RendererStats, ThreadListener};
 pub use renderer::MAX_VERTEX_TEXTURE_WIDTH;
 pub use webrender_api as api;
--- a/gfx/webrender/src/picture.rs
+++ b/gfx/webrender/src/picture.rs
@@ -35,17 +35,17 @@ pub enum PictureCompositeMode {
     /// Draw to intermediate surface, copy straight across. This
     /// is used for CSS isolation, and plane splitting.
     Blit,
 }
 
 /// Configure whether the primitives on this picture
 /// should be rasterized in screen space or local space.
 #[repr(C)]
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub enum RasterizationSpace {
     Local = 0,
     Screen = 1,
 }
 
 #[derive(Debug)]
 pub enum PictureKind {
     TextShadow {
--- a/gfx/webrender/src/platform/macos/font.rs
+++ b/gfx/webrender/src/platform/macos/font.rs
@@ -652,13 +652,13 @@ impl FontContext {
         }
 
         Some(RasterizedGlyph {
             left: metrics.rasterized_left as f32,
             top: metrics.rasterized_ascent as f32,
             width: metrics.rasterized_width,
             height: metrics.rasterized_height,
             scale: 1.0,
-            format: font.get_glyph_format(),
+            format: font.get_glyph_format(true),
             bytes: rasterized_pixels,
         })
     }
 }
--- a/gfx/webrender/src/platform/unix/font.rs
+++ b/gfx/webrender/src/platform/unix/font.rs
@@ -14,17 +14,17 @@ use freetype::freetype::{FT_F26Dot6, FT_
 use freetype::freetype::{FT_GlyphSlot, FT_LcdFilter, FT_New_Face, FT_New_Memory_Face};
 use freetype::freetype::{FT_Init_FreeType, FT_Load_Glyph, FT_Render_Glyph};
 use freetype::freetype::{FT_Library, FT_Outline_Get_CBox, FT_Set_Char_Size, FT_Select_Size};
 use freetype::freetype::{FT_Fixed, FT_Matrix, FT_Set_Transform};
 use freetype::freetype::{FT_LOAD_COLOR, FT_LOAD_DEFAULT, FT_LOAD_FORCE_AUTOHINT};
 use freetype::freetype::{FT_LOAD_IGNORE_GLOBAL_ADVANCE_WIDTH, FT_LOAD_NO_AUTOHINT};
 use freetype::freetype::{FT_LOAD_NO_BITMAP, FT_LOAD_NO_HINTING, FT_LOAD_VERTICAL_LAYOUT};
 use freetype::freetype::{FT_FACE_FLAG_SCALABLE, FT_FACE_FLAG_FIXED_SIZES, FT_Err_Cannot_Render_Glyph};
-use glyph_rasterizer::{FontInstance, GlyphFormat, RasterizedGlyph};
+use glyph_rasterizer::{FontInstance, RasterizedGlyph};
 use internal_types::FastHashMap;
 use std::{cmp, mem, ptr, slice};
 use std::cmp::max;
 use std::ffi::CString;
 use std::sync::Arc;
 
 // These constants are not present in the freetype
 // bindings due to bindgen not handling the way
@@ -529,33 +529,29 @@ impl FontContext {
         let pixel_mode = unsafe { mem::transmute(bitmap.pixel_mode as u32) };
         info!(
             "Rasterizing {:?} as {:?} with dimensions {:?}",
             key,
             font.render_mode,
             dimensions
         );
 
-        let (format, actual_width, actual_height) = match pixel_mode {
+        let (actual_width, actual_height) = match pixel_mode {
             FT_Pixel_Mode::FT_PIXEL_MODE_LCD => {
                 assert!(bitmap.width % 3 == 0);
-                (font.get_subpixel_glyph_format(), (bitmap.width / 3) as i32, bitmap.rows as i32)
+                ((bitmap.width / 3) as i32, bitmap.rows as i32)
             }
             FT_Pixel_Mode::FT_PIXEL_MODE_LCD_V => {
                 assert!(bitmap.rows % 3 == 0);
-                (font.get_subpixel_glyph_format(), bitmap.width as i32, (bitmap.rows / 3) as i32)
-            }
-            FT_Pixel_Mode::FT_PIXEL_MODE_MONO => {
-                (GlyphFormat::Alpha, bitmap.width as i32, bitmap.rows as i32)
+                (bitmap.width as i32, (bitmap.rows / 3) as i32)
             }
-            FT_Pixel_Mode::FT_PIXEL_MODE_GRAY => {
-                (GlyphFormat::Alpha, bitmap.width as i32, bitmap.rows as i32)
-            }
+            FT_Pixel_Mode::FT_PIXEL_MODE_MONO |
+            FT_Pixel_Mode::FT_PIXEL_MODE_GRAY |
             FT_Pixel_Mode::FT_PIXEL_MODE_BGRA => {
-                (GlyphFormat::ColorBitmap, bitmap.width as i32, bitmap.rows as i32)
+                (bitmap.width as i32, bitmap.rows as i32)
             }
             _ => panic!("Unsupported {:?}", pixel_mode),
         };
         let (left, top) = unsafe { ((*slot).bitmap_left, (*slot).bitmap_top) };
         let mut final_buffer = vec![0; (actual_width * actual_height * 4) as usize];
 
         // Extract the final glyph from FT format into RGBA8 format, which is
         // what WR expects.
@@ -639,17 +635,17 @@ impl FontContext {
         }
 
         Some(RasterizedGlyph {
             left: (dimensions.left + left) as f32,
             top: (dimensions.top + top - actual_height) as f32,
             width: actual_width as u32,
             height: actual_height as u32,
             scale,
-            format,
+            format: font.get_glyph_format(pixel_mode == FT_Pixel_Mode::FT_PIXEL_MODE_BGRA),
             bytes: final_buffer,
         })
     }
 }
 
 impl Drop for FontContext {
     fn drop(&mut self) {
         unsafe {
--- a/gfx/webrender/src/platform/windows/font.rs
+++ b/gfx/webrender/src/platform/windows/font.rs
@@ -26,51 +26,53 @@ pub struct FontContext {
 }
 
 // DirectWrite is safe to use on multiple threads and non-shareable resources are
 // all hidden inside their font context.
 unsafe impl Send for FontContext {}
 
 fn dwrite_texture_type(render_mode: FontRenderMode) -> dwrote::DWRITE_TEXTURE_TYPE {
     match render_mode {
-        FontRenderMode::Mono | FontRenderMode::Bitmap => dwrote::DWRITE_TEXTURE_ALIASED_1x1,
-        FontRenderMode::Alpha | FontRenderMode::Subpixel => dwrote::DWRITE_TEXTURE_CLEARTYPE_3x1,
+        FontRenderMode::Mono => dwrote::DWRITE_TEXTURE_ALIASED_1x1,
+        FontRenderMode::Bitmap |
+        FontRenderMode::Alpha |
+        FontRenderMode::Subpixel => dwrote::DWRITE_TEXTURE_CLEARTYPE_3x1,
     }
 }
 
 fn dwrite_measure_mode(
     render_mode: FontRenderMode,
     options: Option<FontInstancePlatformOptions>,
 ) -> dwrote::DWRITE_MEASURING_MODE {
-    let FontInstancePlatformOptions { force_gdi_rendering, use_embedded_bitmap, .. } =
+    let FontInstancePlatformOptions { force_gdi_rendering, .. } =
         options.unwrap_or_default();
-    if force_gdi_rendering || use_embedded_bitmap {
-        return dwrote::DWRITE_MEASURING_MODE_GDI_CLASSIC;
-    }
-
-    match render_mode {
-        FontRenderMode::Mono | FontRenderMode::Bitmap => dwrote::DWRITE_MEASURING_MODE_GDI_NATURAL,
-        FontRenderMode::Alpha | FontRenderMode::Subpixel => dwrote::DWRITE_MEASURING_MODE_NATURAL,
+    if force_gdi_rendering {
+        dwrote::DWRITE_MEASURING_MODE_GDI_CLASSIC
+    } else {
+      match render_mode {
+          FontRenderMode::Mono | FontRenderMode::Bitmap => dwrote::DWRITE_MEASURING_MODE_GDI_CLASSIC,
+          FontRenderMode::Alpha | FontRenderMode::Subpixel => dwrote::DWRITE_MEASURING_MODE_NATURAL,
+      }
     }
 }
 
 fn dwrite_render_mode(
     font_face: &dwrote::FontFace,
     render_mode: FontRenderMode,
     em_size: f32,
     measure_mode: dwrote::DWRITE_MEASURING_MODE,
     options: Option<FontInstancePlatformOptions>,
 ) -> dwrote::DWRITE_RENDERING_MODE {
-    let FontInstancePlatformOptions { force_gdi_rendering, use_embedded_bitmap, .. } =
-        options.unwrap_or_default();
-
     let dwrite_render_mode = match render_mode {
-        FontRenderMode::Mono | FontRenderMode::Bitmap => dwrote::DWRITE_RENDERING_MODE_ALIASED,
+        FontRenderMode::Bitmap => dwrote::DWRITE_RENDERING_MODE_GDI_CLASSIC,
+        FontRenderMode::Mono => dwrote::DWRITE_RENDERING_MODE_ALIASED,
         FontRenderMode::Alpha | FontRenderMode::Subpixel => {
-            if force_gdi_rendering || use_embedded_bitmap {
+            let FontInstancePlatformOptions { force_gdi_rendering, .. } =
+                options.unwrap_or_default();
+            if force_gdi_rendering {
                 dwrote::DWRITE_RENDERING_MODE_GDI_CLASSIC
             } else {
                 font_face.get_recommended_rendering_mode_default_params(em_size, 1.0, measure_mode)
             }
         }
     };
 
     if dwrite_render_mode == dwrote::DWRITE_RENDERING_MODE_OUTLINE {
@@ -252,31 +254,28 @@ impl FontContext {
                     advance: advance,
                 }
             })
     }
 
     // DWrite ClearType gives us values in RGB, but WR expects BGRA.
     fn convert_to_bgra(&self, pixels: &[u8], render_mode: FontRenderMode) -> Vec<u8> {
         match render_mode {
-            FontRenderMode::Bitmap => {
-                unreachable!("TODO: bitmap fonts");
-            }
             FontRenderMode::Mono => {
                 let mut bgra_pixels: Vec<u8> = vec![0; pixels.len() * 4];
                 for i in 0 .. pixels.len() {
                     let alpha = pixels[i];
                     bgra_pixels[i * 4 + 0] = alpha;
                     bgra_pixels[i * 4 + 1] = alpha;
                     bgra_pixels[i * 4 + 2] = alpha;
                     bgra_pixels[i * 4 + 3] = alpha;
                 }
                 bgra_pixels
             }
-            FontRenderMode::Alpha => {
+            FontRenderMode::Alpha | FontRenderMode::Bitmap => {
                 let length = pixels.len() / 3;
                 let mut bgra_pixels: Vec<u8> = vec![0; length * 4];
                 for i in 0 .. length {
                     // Only take the G channel, as it's closest to D2D
                     let alpha = pixels[i * 3 + 1] as u8;
                     bgra_pixels[i * 4 + 0] = alpha;
                     bgra_pixels[i * 4 + 1] = alpha;
                     bgra_pixels[i * 4 + 2] = alpha;
@@ -293,19 +292,21 @@ impl FontContext {
                     bgra_pixels[i * 4 + 2] = pixels[i * 3 + 0];
                     bgra_pixels[i * 4 + 3] = 0xff;
                 }
                 bgra_pixels
             }
         }
     }
 
-    pub fn is_bitmap_font(&mut self, _font: &FontInstance) -> bool {
-        // TODO(gw): Support bitmap fonts in DWrite.
-        false
+    pub fn is_bitmap_font(&mut self, font: &FontInstance) -> bool {
+        // If bitmaps are requested, then treat as a bitmap font to disable transforms.
+        // If mono AA is requested, let that take priority over using bitmaps.
+        font.render_mode != FontRenderMode::Mono &&
+            font.platform_options.unwrap_or_default().use_embedded_bitmap
     }
 
     pub fn prepare_font(font: &mut FontInstance) {
         match font.render_mode {
             FontRenderMode::Mono | FontRenderMode::Bitmap => {
                 // In mono/bitmap modes the color of the font is irrelevant.
                 font.color = ColorU::new(255, 255, 255, 255);
                 // Subpixel positioning is disabled in mono and bitmap modes.
@@ -336,35 +337,33 @@ impl FontContext {
         // Such as for spaces
         if width == 0 || height == 0 {
             return None;
         }
 
         let pixels = analysis.create_alpha_texture(texture_type, bounds);
         let mut bgra_pixels = self.convert_to_bgra(&pixels, font.render_mode);
 
-        match font.render_mode {
-            FontRenderMode::Mono | FontRenderMode::Bitmap => {}
+        let lut_correction = match font.render_mode {
+            FontRenderMode::Mono | FontRenderMode::Bitmap => &self.gdi_gamma_lut,
             FontRenderMode::Alpha | FontRenderMode::Subpixel => {
-                let lut_correction = match font.platform_options {
-                    Some(option) => if option.force_gdi_rendering {
-                        &self.gdi_gamma_lut
-                    } else {
-                        &self.gamma_lut
-                    },
-                    None => &self.gamma_lut,
-                };
-
-                lut_correction.preblend(&mut bgra_pixels, font.color);
+                let FontInstancePlatformOptions { force_gdi_rendering, .. } =
+                    font.platform_options.unwrap_or_default();
+                if force_gdi_rendering {
+                    &self.gdi_gamma_lut
+                } else {
+                    &self.gamma_lut
+                }
             }
-        }
+        };
+        lut_correction.preblend(&mut bgra_pixels, font.color);
 
         Some(RasterizedGlyph {
             left: bounds.left as f32,
             top: -bounds.top as f32,
             width,
             height,
             scale: 1.0,
-            format: font.get_glyph_format(),
+            format: font.get_glyph_format(false),
             bytes: bgra_pixels,
         })
     }
 }
--- a/gfx/webrender/src/prim_store.rs
+++ b/gfx/webrender/src/prim_store.rs
@@ -4,27 +4,27 @@
 
 use api::{BorderRadius, BuiltDisplayList, ColorF, ComplexClipRegion, DeviceIntRect};
 use api::{DevicePoint, ExtendMode, FontRenderMode, GlyphInstance, GlyphKey};
 use api::{GradientStop, ImageKey, ImageRendering, ItemRange, ItemTag, LayerPoint, LayerRect};
 use api::{ClipMode, LayerSize, LayerVector2D, LayerToWorldTransform, LineOrientation, LineStyle};
 use api::{ClipAndScrollInfo, EdgeAaSegmentMask, PremultipliedColorF, TileOffset};
 use api::{ClipId, LayerTransform, PipelineId, YuvColorSpace, YuvFormat};
 use border::BorderCornerInstance;
-use clip_scroll_tree::ClipScrollTree;
+use clip_scroll_tree::{CoordinateSystemId, ClipScrollTree};
 use clip::{ClipSourcesHandle, ClipStore};
 use frame_builder::PrimitiveContext;
 use glyph_rasterizer::{FontInstance, FontTransform};
 use internal_types::FastHashMap;
 use gpu_cache::{GpuBlockData, GpuCache, GpuCacheAddress, GpuCacheHandle, GpuDataRequest,
                 ToGpuBlocks};
 use picture::{PictureKind, PicturePrimitive, RasterizationSpace};
 use profiler::FrameProfileCounters;
-use render_task::{ClipWorkItem, ClipChainNode};
-use render_task::{RenderTask, RenderTaskId, RenderTaskTree};
+use render_task::{ClipChainNode, ClipChainNodeIter, ClipWorkItem, RenderTask, RenderTaskId};
+use render_task::RenderTaskTree;
 use renderer::MAX_VERTEX_TEXTURE_WIDTH;
 use resource_cache::{ImageProperties, ResourceCache};
 use scene::{ScenePipeline, SceneProperties};
 use std::{mem, usize};
 use std::rc::Rc;
 use util::{pack_as_float, recycle_vec, MatrixHelpers, TransformedRect, TransformedRectKind};
 
 #[derive(Debug)]
@@ -594,25 +594,23 @@ impl TextRunPrimitiveCpu {
     pub fn get_font(
         &self,
         device_pixel_ratio: f32,
         transform: &LayerToWorldTransform,
         rasterization_kind: RasterizationSpace,
     ) -> FontInstance {
         let mut font = self.font.clone();
         font.size = font.size.scale_by(device_pixel_ratio);
-        match (font.render_mode, rasterization_kind) {
-            (FontRenderMode::Subpixel, RasterizationSpace::Screen) => {
-                if transform.has_perspective_component() || !transform.has_2d_inverse() {
-                    font.render_mode = FontRenderMode::Alpha;
-                } else {
-                    font.transform = FontTransform::from(transform).quantize();
-                }
+        if font.render_mode != FontRenderMode::Bitmap &&
+           rasterization_kind == RasterizationSpace::Screen {
+            if transform.has_perspective_component() || !transform.has_2d_inverse() {
+                font.render_mode = font.render_mode.limit_by(FontRenderMode::Alpha);
+            } else {
+                font.transform = FontTransform::from(transform).quantize();
             }
-            _ => {}
         }
         font
     }
 
     fn prepare_for_render(
         &mut self,
         resource_cache: &mut ResourceCache,
         device_pixel_ratio: f32,
@@ -1230,83 +1228,139 @@ impl PrimitiveStore {
             }
         }
     }
 
     fn update_clip_task(
         &mut self,
         prim_index: PrimitiveIndex,
         prim_context: &PrimitiveContext,
-        prim_screen_rect: DeviceIntRect,
+        prim_screen_rect: &DeviceIntRect,
+        screen_rect: &DeviceIntRect,
         resource_cache: &mut ResourceCache,
         gpu_cache: &mut GpuCache,
         render_tasks: &mut RenderTaskTree,
         clip_store: &mut ClipStore,
         tasks: &mut Vec<RenderTaskId>,
     ) -> bool {
         let metadata = &mut self.cpu_metadata[prim_index.0];
         metadata.clip_task_id = None;
-        let transform = &prim_context.scroll_node.world_content_transform;
 
-        clip_store.get_mut(&metadata.clip_sources).update(
-            transform,
-            gpu_cache,
-            resource_cache,
-            prim_context.device_pixel_ratio,
-        );
-
-        // Try to create a mask if we may need to.
-        let prim_clips = clip_store.get(&metadata.clip_sources);
-        let is_axis_aligned = transform.transform_kind() == TransformedRectKind::AxisAligned;
+        let prim_screen_rect = match prim_screen_rect.intersection(screen_rect) {
+            Some(rect) => rect,
+            None => {
+                metadata.screen_rect = None;
+                return false;
+            }
+        };
 
-        let has_clips = prim_context.clip_node.clip_chain_node.is_some() || prim_clips.has_clips();
-        let clip_task = if has_clips {
-            // Take into account the actual clip info of the primitive, and
-            // mutate the current bounds accordingly.
-            let mask_rect = match prim_clips.bounds.outer {
-                Some(ref outer) => match prim_screen_rect.intersection(&outer.device_rect) {
-                    Some(rect) => rect,
-                    None => {
-                        metadata.screen_rect = None;
-                        return false;
-                    }
-                },
-                _ => prim_screen_rect,
-            };
+        let clip_chain = prim_context.clip_node.clip_chain_node.clone();
+        let mut combined_outer_rect = match clip_chain {
+            Some(ref node) => prim_screen_rect.intersection(&node.combined_outer_screen_rect),
+            None => Some(prim_screen_rect),
+        };
 
-            let extra_clip = if prim_clips.has_clips() {
+        let prim_coordinate_system_id = prim_context.scroll_node.coordinate_system_id;
+        let transform = &prim_context.scroll_node.world_content_transform;
+        let extra_clip =  {
+            let prim_clips = clip_store.get_mut(&metadata.clip_sources);
+            if prim_clips.has_clips() {
+                prim_clips.update(gpu_cache, resource_cache);
+                let (screen_inner_rect, screen_outer_rect) =
+                    prim_clips.get_screen_bounds(transform, prim_context.device_pixel_ratio);
+
+                if let Some(outer) = screen_outer_rect {
+                    combined_outer_rect = combined_outer_rect.and_then(|r| r.intersection(&outer));
+                }
+
                 Some(Rc::new(ClipChainNode {
                     work_item: ClipWorkItem {
                         scroll_node_data_index: prim_context.scroll_node.node_data_index,
                         clip_sources: metadata.clip_sources.weak(),
-                        coordinate_system_id: prim_context.scroll_node.coordinate_system_id,
+                        coordinate_system_id: prim_coordinate_system_id,
                     },
+                    screen_inner_rect,
+                    combined_outer_screen_rect:
+                        combined_outer_rect.unwrap_or_else(DeviceIntRect::zero),
+                    combined_inner_screen_rect: DeviceIntRect::zero(),
                     prev: None,
                 }))
             } else {
                 None
-            };
+            }
+        };
+
+        // If everything is clipped out, then we don't need to render this primitive.
+        let combined_outer_rect = match combined_outer_rect {
+            Some(rect) if !rect.is_empty() => rect,
+            _ => {
+                metadata.screen_rect = None;
+                return false;
+            }
+        };
+
+        // Filter out all the clip instances that don't contribute to the result.
+        let mut combined_inner_rect = *screen_rect;
+        let clips: Vec<_> = ClipChainNodeIter { current: extra_clip }
+            .chain(ClipChainNodeIter { current: clip_chain })
+            .take_while(|node| {
+                !node.combined_inner_screen_rect.contains_rect(&combined_outer_rect)
+            })
+            .filter_map(|node| {
+                combined_inner_rect = if !node.screen_inner_rect.is_empty() {
+                    // If this clip's inner area contains the area of the primitive clipped
+                    // by previous clips, then it's not going to affect rendering in any way.
+                    if node.screen_inner_rect.contains_rect(&combined_outer_rect) {
+                        return None;
+                    }
+                    combined_inner_rect.intersection(&node.screen_inner_rect)
+                        .unwrap_or_else(DeviceIntRect::zero)
+                } else {
+                    DeviceIntRect::zero()
+                };
 
-            RenderTask::new_mask(
-                None,
-                mask_rect,
-                prim_context.clip_node.clip_chain_node.clone(),
-                extra_clip,
-                prim_screen_rect,
-                clip_store,
-                is_axis_aligned,
-                prim_context.scroll_node.coordinate_system_id,
-            )
-        } else {
-            None
-        };
+                Some(node.work_item.clone())
+            })
+            .collect();
+
+        if clips.is_empty() {
+            // If this item is in the root coordinate system, then
+            // we know that the local_clip_rect in the clip node
+            // will take care of applying this clip, so no need
+            // for a mask.
+            if prim_coordinate_system_id == CoordinateSystemId::root() {
+                return true;
+            }
+
+            // If we have filtered all clips and the screen rect isn't any smaller, we can just
+            // skip masking entirely.
+            if combined_outer_rect == prim_screen_rect {
+                return true;
+            }
+            // Otherwise we create an empty mask, but with an empty inner rect to avoid further
+            // optimization of the empty mask.
+            combined_inner_rect = DeviceIntRect::zero();
+        }
+
+        if combined_inner_rect.contains_rect(&prim_screen_rect) {
+           return true;
+        }
+
+        let clip_task = RenderTask::new_mask(
+            None,
+            combined_outer_rect,
+            combined_inner_rect,
+            clips,
+            clip_store,
+            transform.transform_kind() == TransformedRectKind::AxisAligned,
+            prim_coordinate_system_id,
+        );
 
         if let Some(clip_task) = clip_task {
             let clip_task_id = render_tasks.add(clip_task);
-
             metadata.clip_task_id = Some(clip_task_id);
             tasks.push(clip_task_id);
         }
 
         true
     }
 
     pub fn prepare_prim_for_render(
@@ -1319,50 +1373,57 @@ impl PrimitiveStore {
         clip_store: &mut ClipStore,
         clip_scroll_tree: &ClipScrollTree,
         pipelines: &FastHashMap<PipelineId, ScenePipeline>,
         perform_culling: bool,
         parent_tasks: &mut Vec<RenderTaskId>,
         scene_properties: &SceneProperties,
         profile_counters: &mut FrameProfileCounters,
         pic_index: SpecificPrimitiveIndex,
+        screen_rect: &DeviceIntRect,
     ) -> Option<LayerRect> {
         // Reset the visibility of this primitive.
         // Do some basic checks first, that can early out
         // without even knowing the local rect.
-        let (cpu_prim_index, dependencies, cull_children) = {
+        let (cpu_prim_index, dependencies, cull_children, may_need_clip_mask) = {
             let metadata = &mut self.cpu_metadata[prim_index.0];
             metadata.screen_rect = None;
 
             if perform_culling &&
                !metadata.is_backface_visible &&
                prim_context.scroll_node.world_content_transform.is_backface_visible() {
                 return None;
             }
 
-            let (dependencies, cull_children) = match metadata.prim_kind {
+            let (dependencies, cull_children, may_need_clip_mask) = match metadata.prim_kind {
                 PrimitiveKind::Picture => {
                     let pic = &mut self.cpu_pictures[metadata.cpu_prim_index.0];
 
                     if !pic.resolve_scene_properties(scene_properties) {
                         return None;
                     }
 
-                    let rfid = match pic.kind {
-                        PictureKind::Image { reference_frame_id, .. } => Some(reference_frame_id),
-                        _ => None,
+                    let (rfid, may_need_clip_mask) = match pic.kind {
+                        PictureKind::Image { reference_frame_id, .. } => {
+                            (Some(reference_frame_id), false)
+                        }
+                        _ => {
+                            (None, true)
+                        }
                     };
-                    (Some((pic.pipeline_id, mem::replace(&mut pic.runs, Vec::new()), rfid)), pic.cull_children)
+                    (Some((pic.pipeline_id, mem::replace(&mut pic.runs, Vec::new()), rfid)),
+                     pic.cull_children,
+                     may_need_clip_mask)
                 }
                 _ => {
-                    (None, true)
+                    (None, true, true)
                 }
             };
 
-            (metadata.cpu_prim_index, dependencies, cull_children)
+            (metadata.cpu_prim_index, dependencies, cull_children, may_need_clip_mask)
         };
 
         // If we have dependencies, we need to prepare them first, in order
         // to know the actual rect of this primitive.
         // For example, scrolling may affect the location of an item in
         // local space, which may force us to render this item on a larger
         // picture target, if being composited.
         let mut child_tasks = Vec::new();
@@ -1378,82 +1439,73 @@ impl PrimitiveStore {
                 pipelines,
                 prim_context,
                 cull_children,
                 &mut child_tasks,
                 profile_counters,
                 rfid,
                 scene_properties,
                 cpu_prim_index,
+                screen_rect,
             );
 
             let metadata = &mut self.cpu_metadata[prim_index.0];
 
             // Restore the dependencies (borrow check dance)
             let pic = &mut self.cpu_pictures[cpu_prim_index.0];
             pic.runs = dependencies;
 
             metadata.local_rect = pic.update_local_rect(
                 metadata.local_rect,
                 result,
             );
         }
 
-        let (local_rect, device_rect) = {
+        let (local_rect, unclipped_device_rect) = {
             let metadata = &mut self.cpu_metadata[prim_index.0];
             if metadata.local_rect.size.width <= 0.0 ||
                metadata.local_rect.size.height <= 0.0 {
                 warn!("invalid primitive rect {:?}", metadata.local_rect);
                 return None;
             }
 
-            let local_rect = metadata
-                .local_rect
-                .intersection(&metadata.local_clip_rect);
-
+            let local_rect = metadata.local_rect.intersection(&metadata.local_clip_rect);
             let local_rect = match local_rect {
                 Some(local_rect) => local_rect,
                 None if perform_culling => return None,
                 None => LayerRect::zero(),
             };
 
             let xf_rect = TransformedRect::new(
                 &local_rect,
                 &prim_context.scroll_node.world_content_transform,
                 prim_context.device_pixel_ratio
             );
 
             let clip_bounds = &prim_context.clip_node.combined_clip_outer_bounds;
-            metadata.screen_rect = xf_rect.bounding_rect
-                                          .intersection(clip_bounds);
+            metadata.screen_rect = xf_rect.bounding_rect.intersection(clip_bounds);
 
-            let device_rect = match metadata.screen_rect {
-                Some(device_rect) => device_rect,
-                None => {
-                    if perform_culling {
-                        return None
-                    } else {
-                        DeviceIntRect::zero()
-                    }
-                }
-            };
+            if metadata.screen_rect.is_none() && perform_culling {
+                return None;
+            }
 
-            (local_rect, device_rect)
+            (local_rect, xf_rect.bounding_rect)
         };
 
-        if !self.update_clip_task(
+        if perform_culling && may_need_clip_mask && !self.update_clip_task(
             prim_index,
             prim_context,
-            device_rect,
+            &unclipped_device_rect,
+            screen_rect,
             resource_cache,
             gpu_cache,
             render_tasks,
             clip_store,
             parent_tasks,
-        ) && perform_culling {
+        ) {
             return None;
         }
 
         self.prepare_prim_for_render_inner(
             prim_index,
             prim_context,
             resource_cache,
             gpu_cache,
@@ -1486,16 +1538,17 @@ impl PrimitiveStore {
         pipelines: &FastHashMap<PipelineId, ScenePipeline>,
         parent_prim_context: &PrimitiveContext,
         perform_culling: bool,
         parent_tasks: &mut Vec<RenderTaskId>,
         profile_counters: &mut FrameProfileCounters,
         original_reference_frame_id: Option<ClipId>,
         scene_properties: &SceneProperties,
         pic_index: SpecificPrimitiveIndex,
+        screen_rect: &DeviceIntRect,
     ) -> PrimitiveRunLocalRect {
         let mut result = PrimitiveRunLocalRect {
             local_rect_in_actual_parent_space: LayerRect::zero(),
             local_rect_in_original_parent_space: LayerRect::zero(),
         };
 
         for run in runs {
             // TODO(gw): Perhaps we can restructure this to not need to create
@@ -1552,16 +1605,17 @@ impl PrimitiveStore {
                     clip_store,
                     clip_scroll_tree,
                     pipelines,
                     perform_culling,
                     parent_tasks,
                     scene_properties,
                     profile_counters,
                     pic_index,
+                    screen_rect,
                 ) {
                     profile_counters.visible_primitives.inc();
 
                     if let Some(ref matrix) = original_relative_transform {
                         let bounds = get_local_bounding_rect(&prim_local_rect, matrix);
                         result.local_rect_in_original_parent_space =
                             result.local_rect_in_original_parent_space.union(&bounds);
                     }
--- a/gfx/webrender/src/profiler.rs
+++ b/gfx/webrender/src/profiler.rs
@@ -528,17 +528,19 @@ impl ProfileGraph {
         &self,
         x: f32,
         y: f32,
         description: &'static str,
         debug_renderer: &mut DebugRenderer,
     ) -> Rect<f32> {
         let size = Size2D::new(600.0, 120.0);
         let line_height = debug_renderer.line_height();
-        let mut rect = Rect::new(Point2D::new(x, y), size);
+        let graph_rect = Rect::new(Point2D::new(x, y), size);
+        let mut rect = graph_rect.inflate(10.0, 10.0);
+
         let stats = self.stats();
 
         let text_color = ColorU::new(255, 255, 0, 255);
         let text_origin = rect.origin + vec2(rect.size.width, 20.0);
         debug_renderer.add_text(
             text_origin.x,
             text_origin.y,
             description,
@@ -568,23 +570,21 @@ impl ProfileGraph {
             rect.origin.x,
             rect.origin.y,
             rect.origin.x + rect.size.width + 10.0,
             rect.origin.y + rect.size.height,
             ColorU::new(25, 25, 25, 200),
             ColorU::new(51, 51, 51, 200),
         );
 
-        let bx0 = x + 10.0;
-        let by0 = y + 10.0;
-        let bx1 = bx0 + size.width - 20.0;
-        let by1 = by0 + size.height - 20.0;
+        let bx1 = graph_rect.max_x();
+        let by1 = graph_rect.max_y();
 
-        let w = (bx1 - bx0) / self.max_samples as f32;
-        let h = by1 - by0;
+        let w = graph_rect.size.width / self.max_samples as f32;
+        let h = graph_rect.size.height;
 
         let color_t0 = ColorU::new(0, 255, 0, 255);
         let color_b0 = ColorU::new(0, 180, 0, 255);
 
         let color_t1 = ColorU::new(0, 255, 0, 255);
         let color_b1 = ColorU::new(0, 180, 0, 255);
 
         let color_t2 = ColorU::new(255, 0, 0, 255);
@@ -637,24 +637,21 @@ impl GpuFrameCollection {
             total_time,
             samples,
         });
     }
 }
 
 impl GpuFrameCollection {
     fn draw(&self, x: f32, y: f32, debug_renderer: &mut DebugRenderer) -> Rect<f32> {
-        let bounding_rect = Rect::new(
+        let graph_rect = Rect::new(
             Point2D::new(x, y),
-            Size2D::new(
-                GRAPH_WIDTH + 2.0 * GRAPH_PADDING,
-                GRAPH_HEIGHT + 2.0 * GRAPH_PADDING,
-            ),
+            Size2D::new(GRAPH_WIDTH, GRAPH_HEIGHT),
         );
-        let graph_rect = bounding_rect.inflate(-GRAPH_PADDING, -GRAPH_PADDING);
+        let bounding_rect = graph_rect.inflate(GRAPH_PADDING, GRAPH_PADDING);
 
         debug_renderer.add_quad(
             bounding_rect.origin.x,
             bounding_rect.origin.y,
             bounding_rect.origin.x + bounding_rect.size.width,
             bounding_rect.origin.y + bounding_rect.size.height,
             ColorU::new(25, 25, 25, 200),
             ColorU::new(51, 51, 51, 200),
@@ -835,17 +832,17 @@ impl Profiler {
         renderer_profile: &RendererProfileCounters,
         renderer_timers: &mut RendererProfileTimers,
         gpu_samplers: &[GpuSampler<GpuProfileTag>],
         screen_fraction: f32,
         debug_renderer: &mut DebugRenderer,
     ) {
         self.x_left = 20.0;
         self.y_left = 40.0;
-        self.x_right = 400.0;
+        self.x_right = 450.0;
         self.y_right = 40.0;
 
         let mut gpu_time = 0;
         let gpu_timers = mem::replace(&mut renderer_timers.gpu_samples, Vec::new());
         for sample in &gpu_timers {
             gpu_time += sample.time_ns;
         }
         renderer_timers.gpu_time.set(gpu_time);
@@ -951,29 +948,29 @@ impl Profiler {
         self.ipc_time
             .push(backend_profile.ipc.total_time.nanoseconds);
         self.gpu_time.push(gpu_time);
         self.gpu_frames.push(gpu_time, gpu_timers);
 
 
         let rect =
             self.backend_time
-                .draw_graph(self.x_left, self.y_left, "CPU (backend)", debug_renderer);
-        self.y_left += rect.size.height + PROFILE_PADDING;
+                .draw_graph(self.x_right, self.y_right, "CPU (backend)", debug_renderer);
+        self.y_right += rect.size.height + PROFILE_PADDING;
         let rect = self.compositor_time.draw_graph(
-            self.x_left,
-            self.y_left,
+            self.x_right,
+            self.y_right,
             "CPU (compositor)",
             debug_renderer,
         );
-        self.y_left += rect.size.height + PROFILE_PADDING;
+        self.y_right += rect.size.height + PROFILE_PADDING;
         let rect =
             self.ipc_time
-                .draw_graph(self.x_left, self.y_left, "DisplayList IPC", debug_renderer);
-        self.y_left += rect.size.height + PROFILE_PADDING;
+                .draw_graph(self.x_right, self.y_right, "DisplayList IPC", debug_renderer);
+        self.y_right += rect.size.height + PROFILE_PADDING;
         let rect = self.gpu_time
-            .draw_graph(self.x_left, self.y_left, "GPU", debug_renderer);
-        self.y_left += rect.size.height + PROFILE_PADDING;
+            .draw_graph(self.x_right, self.y_right, "GPU", debug_renderer);
+        self.y_right += rect.size.height + PROFILE_PADDING;
         let rect = self.gpu_frames
-            .draw(self.x_left, self.y_left, debug_renderer);
-        self.y_left += rect.size.height + PROFILE_PADDING;
+            .draw(self.x_left, f32::max(self.y_left, self.y_right), debug_renderer);
+        self.y_right += rect.size.height + PROFILE_PADDING;
     }
 }
--- a/gfx/webrender/src/render_backend.rs
+++ b/gfx/webrender/src/render_backend.rs
@@ -23,17 +23,16 @@ use resource_cache::ResourceCache;
 use scene::Scene;
 #[cfg(feature = "debugger")]
 use serde_json;
 use std::sync::atomic::{ATOMIC_USIZE_INIT, AtomicUsize, Ordering};
 use std::sync::Arc;
 use std::sync::mpsc::Sender;
 use std::u32;
 use texture_cache::TextureCache;
-use thread_profiler::register_thread_with_profiler;
 use time::precise_time_ns;
 
 struct Document {
     scene: Scene,
     frame_ctx: FrameContext,
     // the `Option` here is only to deal with borrow checker
     frame_builder: Option<FrameBuilder>,
     window_size: DeviceUintSize,
@@ -179,18 +178,16 @@ impl RenderBackend {
         blob_image_renderer: Option<Box<BlobImageRenderer>>,
         enable_render_on_scroll: bool,
     ) -> RenderBackend {
         // The namespace_id should start from 1.
         NEXT_NAMESPACE_ID.fetch_add(1, Ordering::Relaxed);
 
         let resource_cache = ResourceCache::new(texture_cache, workers, blob_image_renderer);
 
-        register_thread_with_profiler("Backend".to_string());
-
         RenderBackend {
             api_rx,
             payload_rx,
             payload_tx,
             result_tx,
             default_device_pixel_ratio,
             resource_cache,
             gpu_cache: GpuCache::new(),
--- a/gfx/webrender/src/render_task.rs
+++ b/gfx/webrender/src/render_task.rs
@@ -31,21 +31,24 @@ pub struct RenderTaskTree {
     pub task_data: Vec<RenderTaskData>,
 }
 
 pub type ClipChain = Option<Rc<ClipChainNode>>;
 
 #[derive(Debug)]
 pub struct ClipChainNode {
     pub work_item: ClipWorkItem,
+    pub screen_inner_rect: DeviceIntRect,
+    pub combined_outer_screen_rect: DeviceIntRect,
+    pub combined_inner_screen_rect: DeviceIntRect,
     pub prev: ClipChain,
 }
 
-struct ClipChainNodeIter {
-    current: ClipChain,
+pub struct ClipChainNodeIter {
+    pub current: ClipChain,
 }
 
 impl Iterator for ClipChainNodeIter {
     type Item = Rc<ClipChainNode>;
 
     fn next(&mut self) -> ClipChain {
         let previous = self.current.clone();
         self.current = match self.current {
@@ -321,89 +324,43 @@ impl RenderTask {
             location: RenderTaskLocation::Dynamic(None, screen_rect.size),
             kind: RenderTaskKind::Readback(screen_rect),
             clear_mode: ClearMode::Transparent,
         }
     }
 
     pub fn new_mask(
         key: Option<ClipId>,
-        task_rect: DeviceIntRect,
-        raw_clips: ClipChain,
-        extra_clip: ClipChain,
-        prim_rect: DeviceIntRect,
+        outer_rect: DeviceIntRect,
+        inner_rect: DeviceIntRect,
+        clips: Vec<ClipWorkItem>,
         clip_store: &ClipStore,
         is_axis_aligned: bool,
         prim_coordinate_system_id: CoordinateSystemId,
-    ) -> Option<Self> {
-        // Filter out all the clip instances that don't contribute to the result
-        let mut current_coordinate_system_id = prim_coordinate_system_id;
-        let mut inner_rect = Some(task_rect);
-        let clips: Vec<_> = ClipChainNodeIter { current: raw_clips }
-            .chain(ClipChainNodeIter { current: extra_clip })
-            .filter_map(|node| {
-                let work_item = node.work_item.clone();
-
-                // FIXME(1828): This is a workaround until we can fix the inconsistency between
-                // the shader and the CPU code around how inner_rects are handled.
-                if !node.work_item.has_compatible_coordinate_system(current_coordinate_system_id) {
-                    current_coordinate_system_id = node.work_item.coordinate_system_id;
-                    inner_rect = None;
-                    return Some(work_item)
-                }
-
-                let clip_info = clip_store
-                    .get_opt(&node.work_item.clip_sources)
-                    .expect("bug: clip item should exist");
-                debug_assert!(clip_info.has_clips());
-
-                match clip_info.bounds.inner {
-                    Some(ref inner) if !inner.device_rect.is_empty() => {
-                        inner_rect = inner_rect.and_then(|r| r.intersection(&inner.device_rect));
-                        if inner.device_rect.contains_rect(&task_rect) {
-                            return None;
-                        }
-                    }
-                    _ => inner_rect = None,
-                }
-
-                Some(work_item)
-            })
-            .collect();
-
-        // Nothing to do, all clips are irrelevant for this case
-        if clips.is_empty() {
-            return None;
-        }
-
-
+    ) -> Option<RenderTask> {
         // TODO(gw): This optimization is very conservative for now.
         //           For now, only draw optimized geometry if it is
         //           a single aligned rect mask with rounded corners.
         //           In the future, we'll expand this to handle the
         //           more complex types of clip mask geometry.
-        let mut geometry_kind = MaskGeometryKind::Default;
-        if let Some(inner_rect) = inner_rect {
-            // If the inner rect completely contains the primitive
-            // rect, then this mask can't affect the primitive.
-            if inner_rect.contains_rect(&prim_rect) {
-                return None;
-            }
-            if is_axis_aligned && clips.len() == 1 {
-                geometry_kind = clips[0].get_geometry_kind(clip_store, prim_coordinate_system_id);
-            }
-        }
+        let geometry_kind = if is_axis_aligned &&
+            clips.len() == 1 &&
+            inner_rect.size != DeviceIntSize::zero() {
+            clips[0].get_geometry_kind(clip_store, prim_coordinate_system_id)
+        } else {
+            MaskGeometryKind::Default
+        };
 
         Some(RenderTask {
             cache_key: key.map(RenderTaskKey::CacheMask),
             children: Vec::new(),
-            location: RenderTaskLocation::Dynamic(None, task_rect.size),
+            location: RenderTaskLocation::Dynamic(None, outer_rect.size),
             kind: RenderTaskKind::CacheMask(CacheMaskTask {
-                actual_rect: task_rect,
-                inner_rect: inner_rect.unwrap_or(DeviceIntRect::zero()),
+                actual_rect: outer_rect,
+                inner_rect: inner_rect,
                 clips,
                 geometry_kind,
                 coordinate_system_id: prim_coordinate_system_id,
             }),
             clear_mode: ClearMode::One,
         })
     }
 
--- a/gfx/webrender/src/renderer.rs
+++ b/gfx/webrender/src/renderer.rs
@@ -87,17 +87,17 @@ const GPU_TAG_CACHE_TEXT_RUN: GpuProfile
     label: "C_TextRun",
     color: debug_colors::MISTYROSE,
 };
 const GPU_TAG_CACHE_LINE: GpuProfileTag = GpuProfileTag {
     label: "C_Line",
     color: debug_colors::BROWN,
 };
 const GPU_TAG_SETUP_TARGET: GpuProfileTag = GpuProfileTag {
-    label: "target",
+    label: "target init",
     color: debug_colors::SLATEGREY,
 };
 const GPU_TAG_SETUP_DATA: GpuProfileTag = GpuProfileTag {
     label: "data init",
     color: debug_colors::LIGHTGREY,
 };
 const GPU_TAG_PRIM_RECT: GpuProfileTag = GpuProfileTag {
     label: "Rect",
@@ -277,17 +277,17 @@ impl Into<ShaderMode> for TextShaderMode
     fn into(self) -> i32 {
         self as i32
     }
 }
 
 impl From<GlyphFormat> for TextShaderMode {
     fn from(format: GlyphFormat) -> TextShaderMode {
         match format {
-            GlyphFormat::Alpha => TextShaderMode::Alpha,
+            GlyphFormat::Alpha | GlyphFormat::TransformedAlpha => TextShaderMode::Alpha,
             GlyphFormat::Subpixel | GlyphFormat::TransformedSubpixel => {
                 panic!("Subpixel glyph formats must be handled separately.");
             }
             GlyphFormat::ColorBitmap => TextShaderMode::ColorBitmap,
         }
     }
 }
 
@@ -595,22 +595,21 @@ impl SourceTextureResolver {
             device.delete_texture(texture);
         }
     }
 
     fn end_pass(
         &mut self,
         a8_texture: Option<Texture>,
         rgba8_texture: Option<Texture>,
-        a8_pool: &mut Vec<Texture>,
-        rgba8_pool: &mut Vec<Texture>,
+        pool: &mut Vec<Texture>,
     ) {
         // If we have cache textures from previous pass, return them to the pool.
-        rgba8_pool.extend(self.cache_rgba8_texture.take());
-        a8_pool.extend(self.cache_a8_texture.take());
+        pool.extend(self.cache_rgba8_texture.take());
+        pool.extend(self.cache_a8_texture.take());
 
         // We have another pass to process, make these textures available
         // as inputs to the next pass.
         self.cache_rgba8_texture = rgba8_texture;
         self.cache_a8_texture = a8_texture;
     }
 
     // Bind a source texture to the device.
@@ -1187,16 +1186,17 @@ impl TextShader {
                     TransformedRectKind::AxisAligned => {
                         self.simple.bind(device, projection, mode, renderer_errors)
                     }
                     TransformedRectKind::Complex => {
                         self.transform.bind(device, projection, mode, renderer_errors)
                     }
                 }
             }
+            GlyphFormat::TransformedAlpha |
             GlyphFormat::TransformedSubpixel => {
                 self.glyph_transform.bind(device, projection, mode, renderer_errors)
             }
         }
     }
 
     fn deinit(self, device: &mut Device) {
         self.simple.deinit(device);
@@ -1296,16 +1296,23 @@ pub enum ReadPixelsFormat {
     Bgra8,
 }
 
 struct FrameOutput {
     last_access: FrameId,
     fbo_id: FBOId,
 }
 
+#[derive(PartialEq)]
+struct TargetSelector {
+    size: DeviceUintSize,
+    num_layers: usize,
+    format: ImageFormat,
+}
+
 /// The renderer is responsible for submitting to the GPU the work prepared by the
 /// RenderBackend.
 pub struct Renderer {
     result_rx: Receiver<ResultMsg>,
     debug_server: DebugServer,
     device: Device,
     pending_texture_updates: Vec<TextureUpdateList>,
     pending_gpu_cache_updates: Vec<GpuCacheUpdateList>,
@@ -1365,18 +1372,17 @@ pub struct Renderer {
     enable_clear_scissor: bool,
     debug: DebugRenderer,
     debug_flags: DebugFlags,
     backend_profile_counters: BackendProfileCounters,
     profile_counters: RendererProfileCounters,
     profiler: Profiler,
     last_time: u64,
 
-    color_render_targets: Vec<Texture>,
-    alpha_render_targets: Vec<Texture>,
+    render_target_pool: Vec<Texture>,
 
     gpu_profile: GpuProfiler<GpuProfileTag>,
     prim_vao: VAO,
     blur_vao: VAO,
     clip_vao: VAO,
 
     node_data_texture: VertexDataTexture,
     render_task_texture: VertexDataTexture,
@@ -2006,18 +2012,17 @@ impl Renderer {
             backend_profile_counters: BackendProfileCounters::new(),
             profile_counters: RendererProfileCounters::new(),
             profiler: Profiler::new(),
             max_texture_size: max_texture_size,
             max_recorded_profiles: options.max_recorded_profiles,
             clear_color: options.clear_color,
             enable_clear_scissor: options.enable_clear_scissor,
             last_time: 0,
-            color_render_targets: Vec::new(),
-            alpha_render_targets: Vec::new(),
+            render_target_pool: Vec::new(),
             gpu_profile,
             prim_vao,
             blur_vao,
             clip_vao,
             node_data_texture,
             render_task_texture,
             pipeline_epoch_map: FastHashMap::default(),
             dither_matrix_texture,
@@ -2314,24 +2319,28 @@ impl Renderer {
         let gpu_profiles = self.gpu_profiles.drain(..).collect();
         (cpu_profiles, gpu_profiles)
     }
 
     /// Renders the current frame.
     ///
     /// A Frame is supplied by calling [`generate_frame()`][genframe].
     /// [genframe]: ../../webrender_api/struct.DocumentApi.html#method.generate_frame
-    pub fn render(&mut self, framebuffer_size: DeviceUintSize) -> Result<(), Vec<RendererError>> {
+    pub fn render(
+        &mut self,
+        framebuffer_size: DeviceUintSize
+    ) -> Result<RendererStats, Vec<RendererError>> {
         profile_scope!("render");
 
         if self.active_documents.is_empty() {
             self.last_time = precise_time_ns();
-            return Ok(())
+            return Ok(RendererStats::empty());
         }
 
+        let mut stats = RendererStats::empty();
         let mut frame_profiles = Vec::new();
         let mut profile_timers = RendererProfileTimers::new();
 
         let profile_samplers = {
             let _gm = self.gpu_profile.start_marker("build samples");
             // Block CPU waiting for last frame's GPU profiles to arrive.
             // In general this shouldn't block unless heavily GPU limited.
             let (gpu_frame_id, timers, samplers) = self.gpu_profile.build_samples();
@@ -2383,20 +2392,31 @@ impl Renderer {
                 });
             // don't clear the framebuffer if one of the rendered documents will overwrite it
             if needs_clear {
                 let clear_color = self.clear_color.map(|color| color.to_array());
                 self.device.bind_draw_target(None, None);
                 self.device.clear_target(clear_color, None);
             }
 
+            // Re-use whatever targets possible from the pool, before
+            // they get changed/re-allocated by the rendered frames.
+            for doc_with_id in &mut active_documents {
+                self.prepare_tile_frame(&mut doc_with_id.1.frame);
+            }
+
             for &mut (_, RenderedDocument { ref mut frame, .. }) in &mut active_documents {
                 self.update_gpu_cache(frame);
 
-                self.draw_tile_frame(frame, framebuffer_size, cpu_frame_id);
+                self.draw_tile_frame(
+                    frame,
+                    framebuffer_size,
+                    cpu_frame_id,
+                    &mut stats
+                );
 
                 if self.debug_flags.contains(DebugFlags::PROFILER_DBG) {
                     frame_profiles.push(frame.profile_counters.clone());
                 }
             }
 
             self.unlock_external_images();
             self.active_documents = active_documents;
@@ -2440,17 +2460,17 @@ impl Renderer {
         profile_timers.cpu_time.profile(|| {
             let _gm = self.gpu_profile.start_marker("end frame");
             self.gpu_profile.end_frame();
             self.device.end_frame();
         });
         self.last_time = current_time;
 
         if self.renderer_errors.is_empty() {
-            Ok(())
+            Ok(stats)
         } else {
             Err(mem::replace(&mut self.renderer_errors, Vec::new()))
         }
     }
 
     pub fn layers_are_bouncing_back(&self) -> bool {
         self.active_documents
             .iter()
@@ -2568,16 +2588,17 @@ impl Renderer {
         }
     }
 
     fn draw_instanced_batch<T>(
         &mut self,
         data: &[T],
         vertex_array_kind: VertexArrayKind,
         textures: &BatchTextures,
+        stats: &mut RendererStats,
     ) {
         for i in 0 .. textures.colors.len() {
             self.texture_resolver.bind(
                 &textures.colors[i],
                 TextureSampler::color(i),
                 &mut self.device,
             );
         }
@@ -2598,36 +2619,39 @@ impl Renderer {
         let batched = !self.debug_flags.contains(DebugFlags::DISABLE_BATCHING);
 
         if batched {
             self.device
                 .update_vao_instances(vao, data, VertexUsageHint::Stream);
             self.device
                 .draw_indexed_triangles_instanced_u16(6, data.len() as i32);
             self.profile_counters.draw_calls.inc();
+            stats.total_draw_calls += 1;
         } else {
             for i in 0 .. data.len() {
                 self.device
                     .update_vao_instances(vao, &data[i .. i + 1], VertexUsageHint::Stream);
                 self.device.draw_triangles_u16(0, 6);
                 self.profile_counters.draw_calls.inc();
+                stats.total_draw_calls += 1;
             }
         }
 
         self.profile_counters.vertices.add(6 * data.len());
     }
 
     fn submit_batch(
         &mut self,
         key: &BatchKey,
         instances: &[PrimitiveInstance],
         projection: &Transform3D<f32>,
         render_tasks: &RenderTaskTree,
         render_target: Option<(&Texture, i32)>,
         framebuffer_size: DeviceUintSize,
+        stats: &mut RendererStats,
     ) {
         match key.kind {
             BatchKind::Composite { .. } => {
                 self.ps_composite.bind(&mut self.device, projection, 0, &mut self.renderer_errors);
             }
             BatchKind::HardwareComposite => {
                 self.ps_hw_composite
                     .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
@@ -2836,17 +2860,22 @@ impl Renderer {
             self.device.blit_render_target(src, dest);
 
             // Restore draw target to current pass render target + layer.
             // Note: leaving the viewport unchanged, it's not a part of FBO state
             self.device.bind_draw_target(render_target, None);
         }
 
         let _timer = self.gpu_profile.start_timer(key.kind.gpu_sampler_tag());
-        self.draw_instanced_batch(instances, VertexArrayKind::Primitive, &key.textures);
+        self.draw_instanced_batch(
+            instances,
+            VertexArrayKind::Primitive,
+            &key.textures,
+            stats
+        );
     }
 
     fn handle_scaling(
         &mut self,
         render_tasks: &RenderTaskTree,
         scalings: &Vec<ScalingInfo>,
         source: SourceTexture,
     ) {
@@ -2873,17 +2902,20 @@ impl Renderer {
         render_target: Option<(&Texture, i32)>,
         target: &ColorRenderTarget,
         framebuffer_target_rect: DeviceUintRect,
         target_size: DeviceUintSize,
         clear_color: Option<[f32; 4]>,
         render_tasks: &RenderTaskTree,
         projection: &Transform3D<f32>,
         frame_id: FrameId,
+        stats: &mut RendererStats,
     ) {
+        let _gm = self.gpu_profile.start_marker("color target");
+
         // sanity check for the depth buffer
         if let Some((texture, _)) = render_target {
             assert!(texture.has_depth() >= target.needs_depth());
         }
 
         {
             let _timer = self.gpu_profile.start_timer(GPU_TAG_SETUP_TARGET);
             self.device
@@ -2942,24 +2974,26 @@ impl Renderer {
             self.cs_blur_rgba8
                 .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
 
             if !target.vertical_blurs.is_empty() {
                 self.draw_instanced_batch(
                     &target.vertical_blurs,
                     VertexArrayKind::Blur,
                     &BatchTextures::no_texture(),
+                    stats,
                 );
             }
 
             if !target.horizontal_blurs.is_empty() {
                 self.draw_instanced_batch(
                     &target.horizontal_blurs,
                     VertexArrayKind::Blur,
                     &BatchTextures::no_texture(),
+                    stats,
                 );
             }
         }
 
         self.handle_scaling(render_tasks, &target.scalings, SourceTexture::CacheRGBA8);
 
         // Draw any textrun caches for this target. For now, this
         // is only used to cache text runs that are to be blurred
@@ -2974,32 +3008,34 @@ impl Renderer {
             let _timer = self.gpu_profile.start_timer(GPU_TAG_CACHE_TEXT_RUN);
             self.cs_text_run
                 .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
             for (texture_id, instances) in &target.text_run_cache_prims {
                 self.draw_instanced_batch(
                     instances,
                     VertexArrayKind::Primitive,
                     &BatchTextures::color(*texture_id),
+                    stats,
                 );
             }
         }
         if !target.line_cache_prims.is_empty() {
             // TODO(gw): Technically, we don't need blend for solid
             //           lines. We could check that here?
             self.device.set_blend(true);
             self.device.set_blend_mode_premultiplied_alpha();
 
             let _timer = self.gpu_profile.start_timer(GPU_TAG_CACHE_LINE);
             self.cs_line
                 .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
             self.draw_instanced_batch(
                 &target.line_cache_prims,
                 VertexArrayKind::Primitive,
                 &BatchTextures::no_texture(),
+                stats,
             );
         }
 
         //TODO: record the pixel count for cached primitives
 
         if !target.alpha_batcher.is_empty() {
             let _gl = self.gpu_profile.start_marker("alpha batches");
             self.device.set_blend(false);
@@ -3025,16 +3061,17 @@ impl Renderer {
                 {
                     self.submit_batch(
                         &batch.key,
                         &batch.instances,
                         &projection,
                         render_tasks,
                         render_target,
                         target_size,
+                        stats,
                     );
                 }
 
                 self.device.disable_depth_write();
                 self.gpu_profile.finish_sampler(opaque_sampler);
             }
 
             let transparent_sampler = self.gpu_profile.start_sampler(GPU_SAMPLER_TAG_TRANSPARENT);
@@ -3078,17 +3115,18 @@ impl Renderer {
                                     projection,
                                     TextShaderMode::from(glyph_format),
                                     &mut self.renderer_errors,
                                 );
 
                                 self.draw_instanced_batch(
                                     &batch.instances,
                                     VertexArrayKind::Primitive,
-                                    &batch.key.textures
+                                    &batch.key.textures,
+                                    stats,
                                 );
                             }
                             BlendMode::SubpixelConstantTextColor(color) => {
                                 self.device.set_blend_mode_subpixel_constant_text_color(color);
 
                                 self.ps_text_run.bind(
                                     &mut self.device,
                                     glyph_format,
@@ -3096,17 +3134,18 @@ impl Renderer {
                                     projection,
                                     TextShaderMode::SubpixelConstantTextColor,
                                     &mut self.renderer_errors,
                                 );
 
                                 self.draw_instanced_batch(
                                     &batch.instances,
                                     VertexArrayKind::Primitive,
-                                    &batch.key.textures
+                                    &batch.key.textures,
+                                    stats,
                                 );
                             }
                             BlendMode::SubpixelVariableTextColor => {
                                 // Using the two pass component alpha rendering technique:
                                 //
                                 // http://anholt.livejournal.com/32058.html
                                 //
                                 self.device.set_blend_mode_subpixel_pass0();
@@ -3118,17 +3157,18 @@ impl Renderer {
                                     projection,
                                     TextShaderMode::SubpixelPass0,
                                     &mut self.renderer_errors,
                                 );
 
                                 self.draw_instanced_batch(
                                     &batch.instances,
                                     VertexArrayKind::Primitive,
-                                    &batch.key.textures
+                                    &batch.key.textures,
+                                    stats,
                                 );
 
                                 self.device.set_blend_mode_subpixel_pass1();
 
                                 self.ps_text_run.bind(
                                     &mut self.device,
                                     glyph_format,
                                     transform_kind,
@@ -3159,17 +3199,18 @@ impl Renderer {
                                     projection,
                                     TextShaderMode::SubpixelWithBgColorPass0,
                                     &mut self.renderer_errors,
                                 );
 
                                 self.draw_instanced_batch(
                                     &batch.instances,
                                     VertexArrayKind::Primitive,
-                                    &batch.key.textures
+                                    &batch.key.textures,
+                                    stats,
                                 );
 
                                 self.device.set_blend_mode_subpixel_with_bg_color_pass1();
 
                                 self.ps_text_run_subpx_bg_pass1.bind(
                                     &mut self.device,
                                     glyph_format,
                                     transform_kind,
@@ -3232,16 +3273,17 @@ impl Renderer {
 
                         self.submit_batch(
                             &batch.key,
                             &batch.instances,
                             &projection,
                             render_tasks,
                             render_target,
                             target_size,
+                            stats,
                         );
                     }
                 }
             }
 
             self.device.disable_depth();
             self.device.set_blend(false);
             self.gpu_profile.finish_sampler(transparent_sampler);
@@ -3282,17 +3324,19 @@ impl Renderer {
 
     fn draw_alpha_target(
         &mut self,
         render_target: (&Texture, i32),
         target: &AlphaRenderTarget,
         target_size: DeviceUintSize,
         projection: &Transform3D<f32>,
         render_tasks: &RenderTaskTree,
+        stats: &mut RendererStats,
     ) {
+        let _gm = self.gpu_profile.start_marker("alpha target");
         let alpha_sampler = self.gpu_profile.start_sampler(GPU_SAMPLER_TAG_ALPHA);
 
         {
             let _timer = self.gpu_profile.start_timer(GPU_TAG_SETUP_TARGET);
             self.device
                 .bind_draw_target(Some(render_target), Some(target_size));
             self.device.disable_depth();
             self.device.disable_depth_write();
@@ -3327,125 +3371,137 @@ impl Renderer {
             self.cs_blur_a8
                 .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
 
             if !target.vertical_blurs.is_empty() {
                 self.draw_instanced_batch(
                     &target.vertical_blurs,
                     VertexArrayKind::Blur,
                     &BatchTextures::no_texture(),
+                    stats,
                 );
             }
 
             if !target.horizontal_blurs.is_empty() {
                 self.draw_instanced_batch(
                     &target.horizontal_blurs,
                     VertexArrayKind::Blur,
                     &BatchTextures::no_texture(),
+                    stats,
                 );
             }
         }
 
         self.handle_scaling(render_tasks, &target.scalings, SourceTexture::CacheA8);
 
         if !target.brush_mask_corners.is_empty() {
             self.device.set_blend(false);
 
             let _timer = self.gpu_profile.start_timer(GPU_TAG_BRUSH_MASK);
             self.brush_mask_corner
                 .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
             self.draw_instanced_batch(
                 &target.brush_mask_corners,
                 VertexArrayKind::Primitive,
                 &BatchTextures::no_texture(),
+                stats,
             );
         }
 
         if !target.brush_mask_rounded_rects.is_empty() {
             self.device.set_blend(false);
 
             let _timer = self.gpu_profile.start_timer(GPU_TAG_BRUSH_MASK);
             self.brush_mask_rounded_rect
                 .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
             self.draw_instanced_batch(
                 &target.brush_mask_rounded_rects,
                 VertexArrayKind::Primitive,
                 &BatchTextures::no_texture(),
+                stats,
             );
         }
 
         // Draw the clip items into the tiled alpha mask.
         {
             let _timer = self.gpu_profile.start_timer(GPU_TAG_CACHE_CLIP);
 
             // If we have border corner clips, the first step is to clear out the
             // area in the clip mask. This allows drawing multiple invididual clip
             // in regions below.
             if !target.clip_batcher.border_clears.is_empty() {
-                let _gm = self.gpu_profile.start_marker("clip borders [clear]");
+                let _gm2 = self.gpu_profile.start_marker("clip borders [clear]");
                 self.device.set_blend(false);
                 self.cs_clip_border
                     .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
                 self.draw_instanced_batch(
                     &target.clip_batcher.border_clears,
                     VertexArrayKind::Clip,
                     &BatchTextures::no_texture(),
+                    stats,
                 );
             }
 
             // Draw any dots or dashes for border corners.
             if !target.clip_batcher.borders.is_empty() {
-                let _gm = self.gpu_profile.start_marker("clip borders");
+                let _gm2 = self.gpu_profile.start_marker("clip borders");
                 // We are masking in parts of the corner (dots or dashes) here.
                 // Blend mode is set to max to allow drawing multiple dots.
                 // The individual dots and dashes in a border never overlap, so using
                 // a max blend mode here is fine.
                 self.device.set_blend(true);
                 self.device.set_blend_mode_max();
                 self.cs_clip_border
                     .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
                 self.draw_instanced_batch(
                     &target.clip_batcher.borders,
                     VertexArrayKind::Clip,
                     &BatchTextures::no_texture(),
+                    stats,
                 );
             }
 
             // switch to multiplicative blending
             self.device.set_blend(true);
             self.device.set_blend_mode_multiply();
 
             // draw rounded cornered rectangles
             if !target.clip_batcher.rectangles.is_empty() {
-                let _gm = self.gpu_profile.start_marker("clip rectangles");
+                let _gm2 = self.gpu_profile.start_marker("clip rectangles");
                 self.cs_clip_rectangle.bind(
                     &mut self.device,
                     projection,
                     0,
                     &mut self.renderer_errors,
                 );
                 self.draw_instanced_batch(
                     &target.clip_batcher.rectangles,
                     VertexArrayKind::Clip,
                     &BatchTextures::no_texture(),
+                    stats,
                 );
             }
             // draw image masks
             for (mask_texture_id, items) in target.clip_batcher.images.iter() {
-                let _gm = self.gpu_profile.start_marker("clip images");
+                let _gm2 = self.gpu_profile.start_marker("clip images");
                 let textures = BatchTextures {
                     colors: [
                         mask_texture_id.clone(),
                         SourceTexture::Invalid,
                         SourceTexture::Invalid,
                     ],
                 };
                 self.cs_clip_image
                     .bind(&mut self.device, projection, 0, &mut self.renderer_errors);
-                self.draw_instanced_batch(items, VertexArrayKind::Clip, &textures);
+                self.draw_instanced_batch(
+                    items,
+                    VertexArrayKind::Clip,
+                    &textures,
+                    stats,
+                );
             }
         }
 
         self.gpu_profile.finish_sampler(alpha_sampler);
     }
 
     fn update_deferred_resolves(&mut self, frame: &Frame) {
         // The first thing we do is run through any pending deferred
@@ -3524,54 +3580,91 @@ impl Renderer {
 
             for (ext_data, _) in self.texture_resolver.external_images.drain() {
                 handler.unlock(ext_data.0, ext_data.1);
             }
         }
     }
 
     fn prepare_target_list<T: RenderTarget>(
+        &mut self,
         list: &mut RenderTargetList<T>,
-        device: &mut Device,
-        target_pool: &mut Vec<Texture>,
-        format: ImageFormat,
+        perfect_only: bool,
     ) {
         debug_assert_ne!(list.max_size, DeviceUintSize::zero());
-        debug_assert!(list.texture.is_none());
         if list.targets.is_empty() {
             return;
         }
-        let mut texture = match target_pool.pop() {
-            Some(texture) => texture,
-            None => device.create_texture(TextureTarget::Array),
+        let mut texture = if perfect_only {
+            debug_assert!(list.texture.is_none());
+
+            let selector = TargetSelector {
+                size: list.max_size,
+                num_layers: list.targets.len() as _,
+                format: list.format,
+            };
+            let index = self.render_target_pool
+                .iter()
+                .position(|texture| {
+                    selector == TargetSelector {
+                        size: texture.get_dimensions(),
+                        num_layers: texture.get_render_target_layer_count(),
+                        format: texture.get_format(),
+                    }
+                });
+            match index {
+                Some(pos) => self.render_target_pool.swap_remove(pos),
+                None => return,
+            }
+        } else {
+            if list.texture.is_some() {
+                return
+            }
+            match self.render_target_pool.pop() {
+                Some(texture) => texture,
+                None => self.device.create_texture(TextureTarget::Array),
+            }
         };
-        device.init_texture(
+
+        self.device.init_texture(
             &mut texture,
             list.max_size.width,
             list.max_size.height,
-            format,
+            list.format,
             TextureFilter::Linear,
             Some(RenderTargetInfo {
                 has_depth: list.needs_depth(),
             }),
             list.targets.len() as _,
             None,
         );
         list.texture = Some(texture);
     }
 
-    fn prepare_frame(&mut self, frame: &mut Frame) {
+    fn prepare_tile_frame(&mut self, frame: &mut Frame) {
+        // Init textures and render targets to match this scene.
+        // First pass grabs all the perfectly matching targets from the pool.
+        for pass in &mut frame.passes {
+            if let RenderPassKind::OffScreen { ref mut alpha, ref mut color } = pass.kind {
+                self.prepare_target_list(alpha, true);
+                self.prepare_target_list(color, true);
+            }
+        }
+    }
+
+    fn bind_frame_data(&mut self, frame: &mut Frame) {
         let _timer = self.gpu_profile.start_timer(GPU_TAG_SETUP_DATA);
         self.device.device_pixel_ratio = frame.device_pixel_ratio;
 
-        // Init textures and render targets to match this scene.
+        // Some of the textures are already assigned by `prepare_tile_frame`.
+        // Now re-allocate the space for the rest of the target textures.
         for pass in &mut frame.passes {
             if let RenderPassKind::OffScreen { ref mut alpha, ref mut color } = pass.kind {
-                Self::prepare_target_list(alpha, &mut self.device, &mut self.alpha_render_targets, ImageFormat::A8);
-                Self::prepare_target_list(color, &mut self.device, &mut self.color_render_targets, ImageFormat::BGRA8);
+                self.prepare_target_list(alpha, false);
+                self.prepare_target_list(color, false);
             }
         }
 
         self.node_data_texture
             .update(&mut self.device, &mut frame.node_data);
         self.device
             .bind_texture(TextureSampler::ClipScrollNodes, &self.node_data_texture.texture);
 
@@ -3586,46 +3679,48 @@ impl Renderer {
         debug_assert!(self.texture_resolver.cache_rgba8_texture.is_none());
     }
 
     fn draw_tile_frame(
         &mut self,
         frame: &mut Frame,
         framebuffer_size: DeviceUintSize,
         frame_id: FrameId,
+        stats: &mut RendererStats,
     ) {
         let _gm = self.gpu_profile.start_marker("tile frame draw");
 
         if frame.passes.is_empty() {
             return;
         }
 
         self.device.disable_depth_write();
         self.device.disable_stencil();
         self.device.set_blend(false);
 
-        self.prepare_frame(frame);
-
-        let base_color_target_count = self.color_render_targets.len();
-        let base_alpha_target_count = self.alpha_render_targets.len();
+        self.bind_frame_data(frame);
 
         for (pass_index, pass) in frame.passes.iter_mut().enumerate() {
+            self.gpu_profile.place_marker(&format!("pass {}", pass_index));
+
             self.texture_resolver.bind(
                 &SourceTexture::CacheA8,
                 TextureSampler::CacheA8,
                 &mut self.device,
             );
             self.texture_resolver.bind(
                 &SourceTexture::CacheRGBA8,
                 TextureSampler::CacheRGBA8,
                 &mut self.device,
             );
 
             let (cur_alpha, cur_color) = match pass.kind {
                 RenderPassKind::MainFramebuffer(ref target) => {
+                    stats.color_target_count += 1;
+
                     let clear_color = frame.background_color.map(|color| color.to_array());
                     let projection = Transform3D::ortho(
                         0.0,
                         framebuffer_size.width as f32,
                         framebuffer_size.height as f32,
                         0.0,
                         ORTHO_NEAR_PLANE,
                         ORTHO_FAR_PLANE,
@@ -3635,44 +3730,50 @@ impl Renderer {
                         None,
                         target,
                         frame.inner_rect,
                         framebuffer_size,
                         clear_color,
                         &frame.render_tasks,
                         &projection,
                         frame_id,
+                        stats,
                     );
 
                     (None, None)
                 }
                 RenderPassKind::OffScreen { ref mut alpha, ref mut color } => {
                     assert!(alpha.targets.is_empty() || alpha.texture.is_some());
                     assert!(color.targets.is_empty() || color.texture.is_some());
 
                     for (target_index, target) in alpha.targets.iter().enumerate() {
+                        stats.alpha_target_count += 1;
+
                         let projection = Transform3D::ortho(
                             0.0,
                             alpha.max_size.width as f32,
                             0.0,
                             alpha.max_size.height as f32,
                             ORTHO_NEAR_PLANE,
                             ORTHO_FAR_PLANE,
                         );
 
                         self.draw_alpha_target(
                             (alpha.texture.as_ref().unwrap(), target_index as i32),
                             target,
                             alpha.max_size,
                             &projection,
                             &frame.render_tasks,
+                            stats,
                         );
                     }
 
                     for (target_index, target) in color.targets.iter().enumerate() {
+                        stats.color_target_count += 1;
+
                         let projection = Transform3D::ortho(
                             0.0,
                             color.max_size.width as f32,
                             0.0,
                             color.max_size.height as f32,
                             ORTHO_NEAR_PLANE,
                             ORTHO_FAR_PLANE,
                         );
@@ -3681,44 +3782,42 @@ impl Renderer {
                             Some((color.texture.as_ref().unwrap(), target_index as i32)),
                             target,
                             frame.inner_rect,
                             color.max_size,
                             Some([0.0, 0.0, 0.0, 0.0]),
                             &frame.render_tasks,
                             &projection,
                             frame_id,
+                            stats,
                         );
                     }
 
                     (alpha.texture.take(), color.texture.take())
                 }
             };
 
             self.texture_resolver.end_pass(
                 cur_alpha,
                 cur_color,
-                &mut self.alpha_render_targets,
-                &mut self.color_render_targets,
+                &mut self.render_target_pool,
             );
 
             // After completing the first pass, make the A8 target available as an
             // input to any subsequent passes.
             if pass_index == 0 {
                 if let Some(shared_alpha_texture) =
                     self.texture_resolver.resolve(&SourceTexture::CacheA8)
                 {
                     self.device
                         .bind_texture(TextureSampler::SharedCacheA8, shared_alpha_texture);
                 }
             }
         }
 
-        self.color_render_targets[base_color_target_count..].reverse();
-        self.alpha_render_targets[base_alpha_target_count..].reverse();
         self.draw_render_target_debug(framebuffer_size);
         self.draw_texture_cache_debug(framebuffer_size);
 
         // Garbage collect any frame outputs that weren't used this frame.
         let device = &mut self.device;
         self.output_targets
             .retain(|_, target| if target.last_access != frame_id {
                 device.delete_fbo(target.fbo_id);
@@ -3774,33 +3873,29 @@ impl Renderer {
     fn draw_render_target_debug(&mut self, framebuffer_size: DeviceUintSize) {
         if !self.debug_flags.contains(DebugFlags::RENDER_TARGET_DBG) {
             return;
         }
 
         let mut spacing = 16;
         let mut size = 512;
         let fb_width = framebuffer_size.width as i32;
-        let num_layers: i32 = self.color_render_targets
+        let num_layers: i32 = self.render_target_pool
             .iter()
-            .chain(self.alpha_render_targets.iter())
             .map(|texture| texture.get_render_target_layer_count() as i32)
             .sum();
 
         if num_layers * (size + spacing) > fb_width {
             let factor = fb_width as f32 / (num_layers * (size + spacing)) as f32;
             size = (size as f32 * factor) as i32;
             spacing = (spacing as f32 * factor) as i32;
         }
 
         let mut target_index = 0;
-        for texture in self.color_render_targets
-            .iter()
-            .chain(self.alpha_render_targets.iter())
-        {
+        for texture in &self.render_target_pool {
             let dimensions = texture.get_dimensions();
             let src_rect = DeviceIntRect::new(DeviceIntPoint::zero(), dimensions.to_i32());
 
             let layer_count = texture.get_render_target_layer_count();
             for layer_index in 0 .. layer_count {
                 self.device
                     .bind_read_target(Some((texture, layer_index as i32)));
                 let x = fb_width - (spacing + size) * (target_index + 1);
@@ -3901,20 +3996,17 @@ impl Renderer {
         //Note: this is a fake frame, only needed because texture deletion is require to happen inside a frame
         self.device.begin_frame();
         self.gpu_cache_texture.deinit(&mut self.device);
         if let Some(dither_matrix_texture) = self.dither_matrix_texture {
             self.device.delete_texture(dither_matrix_texture);
         }
         self.node_data_texture.deinit(&mut self.device);
         self.render_task_texture.deinit(&mut self.device);
-        for texture in self.alpha_render_targets {
-            self.device.delete_texture(texture);
-        }
-        for texture in self.color_render_targets {
+        for texture in self.render_target_pool {
             self.device.delete_texture(texture);
         }
         self.device.delete_pbo(self.texture_cache_upload_pbo);
         self.texture_resolver.deinit(&mut self.device);
         self.device.delete_vao(self.prim_vao);
         self.device.delete_vao(self.clip_vao);
         self.device.delete_vao(self.blur_vao);
         self.debug.deinit(&mut self.device);
@@ -4072,8 +4164,28 @@ pub struct DebugServer;
 #[cfg(not(feature = "debugger"))]
 impl DebugServer {
     pub fn new(_: MsgSender<ApiMsg>) -> DebugServer {
         DebugServer
     }
 
     pub fn send(&mut self, _: String) {}
 }
+
+/// Basic statistics about the rendered scene. Wrench reftests use them to
+/// check that batching and render-target allocation happen as expected.
+/// `RendererStats::empty()` yields all-zero counters; `draw_tile_frame`
+/// adds to them through the `&mut RendererStats` argument it receives.
+pub struct RendererStats {
+    pub total_draw_calls: usize,
+    pub alpha_target_count: usize, // +1 per off-screen alpha target drawn in draw_tile_frame
+    pub color_target_count: usize, // +1 per main-framebuffer pass and per off-screen color target
+}
+
+impl RendererStats {
+    pub fn empty() -> RendererStats {
+        RendererStats {
+            total_draw_calls: 0,
+            alpha_target_count: 0,
+            color_target_count: 0,
+        }
+    }
+}
--- a/gfx/webrender/src/texture_cache.rs
+++ b/gfx/webrender/src/texture_cache.rs
@@ -5,17 +5,17 @@
 use api::{DeviceUintPoint, DeviceUintRect, DeviceUintSize};
 use api::{ExternalImageType, ImageData, ImageFormat};
 use api::ImageDescriptor;
 use device::TextureFilter;
 use frame::FrameId;
 use freelist::{FreeList, FreeListHandle, UpsertResult, WeakFreeListHandle};
 use gpu_cache::{GpuCache, GpuCacheHandle};
 use internal_types::{CacheTextureId, TextureUpdateList, TextureUpdateSource};
-use internal_types::{SourceTexture, TextureUpdate, TextureUpdateOp};
+use internal_types::{RenderTargetInfo, SourceTexture, TextureUpdate, TextureUpdateOp};
 use profiler::{ResourceProfileCounter, TextureCacheProfileCounters};
 use resource_cache::CacheItem;
 use std::cmp;
 use std::mem;
 
 // The fixed number of layers for the shared texture cache.
 // There is one array texture per image format, allocated lazily.
 const TEXTURE_ARRAY_LAYERS_LINEAR: usize = 4;
@@ -576,17 +576,24 @@ impl TextureCache {
 
             let update_op = TextureUpdate {
                 id: texture_id,
                 op: TextureUpdateOp::Create {
                     width: TEXTURE_LAYER_DIMENSIONS,
                     height: TEXTURE_LAYER_DIMENSIONS,
                     format: descriptor.format,
                     filter: texture_array.filter,
-                    render_target: None,
+                    // TODO(gw): Creating a render target here is only used
+                    //           for the texture cache debugger display. In
+                    //           the future, we should change the debug
+                    //           display to use a shader that blits the
+                    //           texture, and then we can remove this
+                    //           memory allocation (same for the other
+                    //           standalone texture below).
+                    render_target: Some(RenderTargetInfo { has_depth: false }),
                     layer_count: texture_array.layer_count as i32,
                 },
             };
             self.pending_updates.push(update_op);
 
             texture_array.texture_id = Some(texture_id);
         }
 
@@ -665,17 +672,17 @@ impl TextureCache {
             // of the right size / format.
             let update_op = TextureUpdate {
                 id: texture_id,
                 op: TextureUpdateOp::Create {
                     width: descriptor.width,
                     height: descriptor.height,
                     format: descriptor.format,
                     filter,
-                    render_target: None,
+                    render_target: Some(RenderTargetInfo { has_depth: false }),
                     layer_count: 1,
                 },
             };
             self.pending_updates.push(update_op);
 
             new_cache_entry = Some(CacheEntry::new_standalone(
                 texture_id,
                 size,
--- a/gfx/webrender/src/tiling.rs
+++ b/gfx/webrender/src/tiling.rs
@@ -1,15 +1,16 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use api::{BorderRadiusKind, ClipId, ColorF, DeviceIntPoint, ImageKey};
 use api::{DeviceIntRect, DeviceIntSize, DeviceUintPoint, DeviceUintRect, DeviceUintSize};
-use api::{DocumentLayer, ExternalImageType, FilterOp, FontRenderMode, ImageRendering};
+use api::{DocumentLayer, ExternalImageType, FilterOp, FontRenderMode};
+use api::{ImageFormat, ImageRendering};
 use api::{LayerRect, MixBlendMode, PipelineId};
 use api::{TileOffset, YuvColorSpace, YuvFormat};
 use api::{LayerToWorldTransform, WorldPixel};
 use border::{BorderCornerInstance, BorderCornerSide};
 use clip::{ClipSource, ClipStore};
 use clip_scroll_tree::{ClipScrollTree, CoordinateSystemId};
 use device::Texture;
 use euclid::{TypedTransform3D, vec3};
@@ -1253,27 +1254,30 @@ pub trait RenderTarget {
 #[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
 pub enum RenderTargetKind {
     Color, // RGBA32
     Alpha, // R8
 }
 
 pub struct RenderTargetList<T> {
     screen_size: DeviceIntSize,
+    pub format: ImageFormat,
     pub max_size: DeviceUintSize,
     pub targets: Vec<T>,
     pub texture: Option<Texture>,
 }
 
 impl<T: RenderTarget> RenderTargetList<T> {
     fn new(
         screen_size: DeviceIntSize,
+        format: ImageFormat,
     ) -> Self {
         RenderTargetList {
             screen_size,
+            format,
             max_size: DeviceUintSize::new(MIN_TARGET_SIZE, MIN_TARGET_SIZE),
             targets: Vec::new(),
             texture: None,
         }
     }
 
     fn build(
         &mut self,
@@ -1748,18 +1752,18 @@ impl RenderPass {
             tasks: vec![],
             dynamic_tasks: FastHashMap::default(),
         }
     }
 
     pub fn new_off_screen(screen_size: DeviceIntSize) -> Self {
         RenderPass {
             kind: RenderPassKind::OffScreen {
-                color: RenderTargetList::new(screen_size),
-                alpha: RenderTargetList::new(screen_size),
+                color: RenderTargetList::new(screen_size, ImageFormat::BGRA8),
+                alpha: RenderTargetList::new(screen_size, ImageFormat::A8),
             },
             tasks: vec![],
             dynamic_tasks: FastHashMap::default(),
         }
     }
 
     pub fn add_render_task(
         &mut self,
--- a/gfx/webrender/src/util.rs
+++ b/gfx/webrender/src/util.rs
@@ -1,21 +1,21 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 use api::{BorderRadius, ComplexClipRegion, DeviceIntPoint, DeviceIntRect, DeviceIntSize};
-use api::{DevicePoint, DeviceRect, DeviceSize, LayerRect, LayerToWorldTransform};
-use api::{LayoutPoint, LayoutRect, LayoutSize};
-use api::WorldPoint3D;
+use api::{DevicePoint, DeviceRect, DeviceSize, LayerPoint, LayerRect, LayerSize};
+use api::{LayerToWorldTransform, LayoutPoint, LayoutRect, LayoutSize, WorldPoint3D};
 use euclid::{Point2D, Rect, Size2D, TypedPoint2D, TypedRect, TypedSize2D, TypedTransform2D};
 use euclid::TypedTransform3D;
 use num_traits::Zero;
 use std::f32::consts::FRAC_1_SQRT_2;
 use std::i32;
+use std::f32;
 
 // Matches the definition of SK_ScalarNearlyZero in Skia.
 const NEARLY_ZERO: f32 = 1.0 / 4096.0;
 
 // TODO: Implement these in euclid!
 pub trait MatrixHelpers<Src, Dst> {
     fn transform_rect(&self, rect: &TypedRect<f32, Src>) -> TypedRect<f32, Dst>;
     fn is_identity(&self) -> bool;
@@ -205,17 +205,17 @@ pub fn get_normal(x: f32) -> Option<f32>
     if x.is_normal() {
         Some(x)
     } else {
         None
     }
 }
 
 #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
-#[repr(u8)]
+#[repr(u32)]
 pub enum TransformedRectKind {
     AxisAligned = 0,
     Complex = 1,
 }
 
 #[derive(Debug, Clone)]
 pub struct TransformedRect {
     pub local_rect: LayerRect,
@@ -435,16 +435,25 @@ pub mod test {
         assert_eq!(m1.inverse_project(&p0), Some(Point2D::new(2.0, 2.0)));
     }
 }
 
 pub trait MaxRect {
     fn max_rect() -> Self;
 }
 
+impl MaxRect for LayerRect {
+    fn max_rect() -> Self {
+        LayerRect::new(
+            LayerPoint::new(f32::MIN / 2.0, f32::MIN / 2.0), // origin at half the most negative finite f32
+            LayerSize::new(f32::MAX, f32::MAX), // origin + size is then f32::MAX / 2, still finite
+        )
+    }
+}
+
 impl MaxRect for DeviceIntRect {
     fn max_rect() -> Self {
         DeviceIntRect::new(
             DeviceIntPoint::new(i32::MIN / 2, i32::MIN / 2),
             DeviceIntSize::new(i32::MAX, i32::MAX),
         )
     }
 }