Bug 562590 - Make incomplete byte sequences near HTML EOF emit a REPLACEMENT CHARACTER. draft
author: Henri Sivonen <hsivonen@hsivonen.fi>
Thu, 22 Jun 2017 14:32:34 +0300
changeset 604032 f3da66a71abccaaed14f6a7e5ae27bae0db85362
parent 604031 798b89d453fa13659e35f17761aa781c1f043eb2
child 636059 db250fd307cc15018e2fccd5ee85f154c7b3ead3
push id: 66925
push user: bmo:hsivonen@hsivonen.fi
push date: Wed, 05 Jul 2017 04:56:04 +0000
bugs: 562590
milestone: 56.0a1
Bug 562590 - Make incomplete byte sequences near HTML EOF emit a REPLACEMENT CHARACTER. MozReview-Commit-ID: 6NF4rMWxyVu
parser/html/nsHtml5StreamParser.cpp
testing/web-platform/meta/MANIFEST.json
testing/web-platform/tests/encoding/eof-shift_jis-ref.html
testing/web-platform/tests/encoding/eof-shift_jis.html
testing/web-platform/tests/encoding/eof-utf-8-one-ref.html
testing/web-platform/tests/encoding/eof-utf-8-one.html
testing/web-platform/tests/encoding/eof-utf-8-three-ref.html
testing/web-platform/tests/encoding/eof-utf-8-three.html
testing/web-platform/tests/encoding/eof-utf-8-two-ref.html
testing/web-platform/tests/encoding/eof-utf-8-two.html
--- a/parser/html/nsHtml5StreamParser.cpp
+++ b/parser/html/nsHtml5StreamParser.cpp
@@ -829,25 +829,16 @@ nsHtml5StreamParser::WriteStreamBytes(co
   NS_ASSERTION(IsParserThread(), "Wrong thread!");
   // mLastBuffer should always point to a buffer of the size
   // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE.
   if (!mLastBuffer) {
     NS_WARNING("mLastBuffer should not be null!");
     MarkAsBroken(NS_ERROR_NULL_POINTER);
     return NS_ERROR_NULL_POINTER;
   }
-  if (mLastBuffer->getEnd() == NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE) {
-    RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
-      nsHtml5OwningUTF16Buffer::FalliblyCreate(
-        NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
-    if (!newBuf) {
-      return NS_ERROR_OUT_OF_MEMORY;
-    }
-    mLastBuffer = (mLastBuffer->next = newBuf.forget());
-  }
   size_t totalRead = 0;
   auto src = MakeSpan(aFromSegment, aCount);
   for (;;) {
     auto dst = mLastBuffer->TailAsSpan(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
     uint32_t result;
     size_t read;
     size_t written;
     bool hadErrors;
@@ -857,22 +848,20 @@ nsHtml5StreamParser::WriteStreamBytes(co
     src = src.From(read);
     totalRead += read;
     mLastBuffer->AdvanceEnd(written);
     if (result == kOutputFull) {
       RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
         nsHtml5OwningUTF16Buffer::FalliblyCreate(
           NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
       if (!newBuf) {
+        MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
         return NS_ERROR_OUT_OF_MEMORY;
       }
       mLastBuffer = (mLastBuffer->next = newBuf.forget());
-      // All input may have been consumed if there is a pending surrogate pair
-      // that doesn't fit in the output buffer. Loop back to push a zero-length
-      // input to the decoder in that case.
     } else {
       MOZ_ASSERT(totalRead == aCount,
                  "The Unicode decoder consumed the wrong number of bytes.");
       *aWriteCount = totalRead;
       return NS_OK;
     }
   }
 }
@@ -1051,16 +1040,53 @@ nsHtml5StreamParser::DoStopRequest()
     if (NS_FAILED(rv = FinalizeSniffing(nullptr, 0, &writeCount, 0))) {
       MarkAsBroken(rv);
       return;
     }
   } else if (mFeedChardet) {
     mChardet->Done();
   }
 
+  MOZ_ASSERT(mUnicodeDecoder, "Should have a decoder after finalizing sniffing.");
+
+  // mLastBuffer should always point to a buffer of the size
+  // NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE.
+  if (!mLastBuffer) {
+    NS_WARNING("mLastBuffer should not be null!");
+    MarkAsBroken(NS_ERROR_NULL_POINTER);
+    return;
+  }
+
+  Span<uint8_t> src; // empty span
+  for (;;) {
+    auto dst = mLastBuffer->TailAsSpan(NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
+    uint32_t result;
+    size_t read;
+    size_t written;
+    bool hadErrors;
+    Tie(result, read, written, hadErrors) =
+      mUnicodeDecoder->DecodeToUTF16(src, dst, true);
+    Unused << hadErrors;
+    MOZ_ASSERT(read == 0, "How come an empty span was read form?");
+    mLastBuffer->AdvanceEnd(written);
+    if (result == kOutputFull) {
+      RefPtr<nsHtml5OwningUTF16Buffer> newBuf =
+        nsHtml5OwningUTF16Buffer::FalliblyCreate(
+          NS_HTML5_STREAM_PARSER_READ_BUFFER_SIZE);
+      if (!newBuf) {
+        MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
+        return;
+      }
+      mLastBuffer = (mLastBuffer->next = newBuf.forget());
+    } else {
+      break;
+    }
+  }
+
+
   if (IsTerminatedOrInterrupted()) {
     return;
   }
 
   ParseAvailableData();
 }
 
 class nsHtml5RequestStopper : public Runnable
--- a/testing/web-platform/meta/MANIFEST.json
+++ b/testing/web-platform/meta/MANIFEST.json
@@ -9076,16 +9076,64 @@
       [
        "/css/css-transitions-1/reference/transition-test-ref.html",
        "=="
       ]
      ],
      {}
     ]
    ],
+   "encoding/eof-shift_jis.html": [
+    [
+     "/encoding/eof-shift_jis.html",
+     [
+      [
+       "/encoding/eof-shift_jis-ref.html",
+       "=="
+      ]
+     ],
+     {}
+    ]
+   ],
+   "encoding/eof-utf-8-one.html": [
+    [
+     "/encoding/eof-utf-8-one.html",
+     [
+      [
+       "/encoding/eof-utf-8-one-ref.html",
+       "=="
+      ]
+     ],
+     {}
+    ]
+   ],
+   "encoding/eof-utf-8-three.html": [
+    [
+     "/encoding/eof-utf-8-three.html",
+     [
+      [
+       "/encoding/eof-utf-8-three-ref.html",
+       "=="
+      ]
+     ],
+     {}
+    ]
+   ],
+   "encoding/eof-utf-8-two.html": [
+    [
+     "/encoding/eof-utf-8-two.html",
+     [
+      [
+       "/encoding/eof-utf-8-two-ref.html",
+       "=="
+      ]
+     ],
+     {}
+    ]
+   ],
    "html/dom/elements/global-attributes/dir_auto-EN-L.html": [
     [
      "/html/dom/elements/global-attributes/dir_auto-EN-L.html",
      [
       [
        "/html/dom/elements/global-attributes/dir_auto-EN-L-ref.html",
        "=="
       ]
@@ -46288,16 +46336,36 @@
      {}
     ]
    ],
    "encoding/OWNERS": [
     [
      {}
     ]
    ],
+   "encoding/eof-shift_jis-ref.html": [
+    [
+     {}
+    ]
+   ],
+   "encoding/eof-utf-8-one-ref.html": [
+    [
+     {}
+    ]
+   ],
+   "encoding/eof-utf-8-three-ref.html": [
+    [
+     {}
+    ]
+   ],
+   "encoding/eof-utf-8-two-ref.html": [
+    [
+     {}
+    ]
+   ],
    "encoding/resources/encodings.js": [
     [
      {}
     ]
    ],
    "encoding/resources/single-byte-raw.py": [
     [
      {}
@@ -173643,16 +173711,48 @@
   "encoding/api-surrogates-utf8.html": [
    "e44be4c30e9c65a4b51972efab2e161f166d58a5",
    "testharness"
   ],
   "encoding/big5-encoder.html": [
    "b9635c43ce159e1961106b039dce0e3d04fade34",
    "testharness"
   ],
+  "encoding/eof-shift_jis-ref.html": [
+   "55ac2be8c2cce3bae0ea1e61f5c330c38adc1e9e",
+   "support"
+  ],
+  "encoding/eof-shift_jis.html": [
+   "c5f6bd10724c2186f9c3347a8eca29627c8fcb5d",
+   "reftest"
+  ],
+  "encoding/eof-utf-8-one-ref.html": [
+   "88e83397cb0e375bc9c84ddedfe60aa5cb11a667",
+   "support"
+  ],
+  "encoding/eof-utf-8-one.html": [
+   "8f89ca912cc4d01dc9cb72e027aef26e1d0cb2a6",
+   "reftest"
+  ],
+  "encoding/eof-utf-8-three-ref.html": [
+   "48dbb873550d0fb33c45a4ab4fff03654b1732b2",
+   "support"
+  ],
+  "encoding/eof-utf-8-three.html": [
+   "b04b8002836ea0f476b65270bb2b5797ffec5fdd",
+   "reftest"
+  ],
+  "encoding/eof-utf-8-two-ref.html": [
+   "ef39f0e0af97a88ceaf652a9df0b2cd07719bff5",
+   "support"
+  ],
+  "encoding/eof-utf-8-two.html": [
+   "dd855efd3ea7bd865c35c105506473fb7731bc36",
+   "reftest"
+  ],
   "encoding/gb18030-encoder.html": [
    "6f091a64de33492bd17ecbd37f5db0fff9af499a",
    "testharness"
   ],
   "encoding/gbk-encoder.html": [
    "30bdfc96ffaff0277ceca69aad43d82d5ac691b6",
    "testharness"
   ],
@@ -178656,17 +178756,17 @@
    "ac9d401368b75e00adbdf80ee42dd8dce1e48e13",
    "testharness"
   ],
   "html/browsers/the-window-object/window-indexed-properties.html": [
    "22d5cb06bfc4724d27f565b8ffa2280bf2e8538b",
    "testharness"
   ],
   "html/browsers/the-window-object/window-named-properties.html": [
-   "03bab12397e43003c6a4d768d2faa580501400bf",
+   "21bb2b7a30381decf8b55152ba33cd723b67b8d5",
    "testharness"
   ],
   "html/browsers/the-window-object/window-open-noopener.html": [
    "2e20bfcd1dfe9bee00a9747b87cdaf42004d6415",
    "testharness"
   ],
   "html/browsers/the-window-object/window-properties.html": [
    "ee0ade0a8de422597c362d15cf4a9dd446e4af00",
@@ -203100,17 +203200,17 @@
    "722fea5d23733d822faf5717f66f2ff2cccb62a4",
    "support"
   ],
   "old-tests/submission/Opera/script_scheduling/scripts/include-10.js": [
    "8b70b5a43ab3f1efd9755bf95f9a2713bbe588ce",
    "support"
   ],
   "old-tests/submission/Opera/script_scheduling/scripts/include-11.js": [
-   "af092f1827bebd1e69f3b914049ad4febfcc8dc5",
+   "f26e65bc15505040371e3026cd09c8eb0df0a1c4",
    "support"
   ],
   "old-tests/submission/Opera/script_scheduling/scripts/include-12.js": [
    "7dcd929b6b7dd65553f87ec8b0fbf0ec84c335b2",
    "support"
   ],
   "old-tests/submission/Opera/script_scheduling/scripts/include-2.js": [
    "3a3c18956bb0ccf8c13b301cd3c6f0a6e474ce75",
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-shift_jis-ref.html
@@ -0,0 +1,4 @@
+<!doctype html>
+<meta charset=shift_jis>
+<title>Shift_JIS file ending with a truncated sequence</title>
+One-byte truncated sequence:&#xFFFD;
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-shift_jis.html
@@ -0,0 +1,5 @@
+<!doctype html>
+<meta charset=shift_jis>
+<title>Shift_JIS file ending with a truncated sequence</title>
+<link rel=match href=/encoding/eof-shift_jis-ref.html>
+One-byte truncated sequence:ƒ
\ No newline at end of file
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-utf-8-one-ref.html
@@ -0,0 +1,4 @@
+<!doctype html>
+<meta charset=utf-8>
+<title>UTF-8 file ending with a one-byte truncated sequence</title>
+One-byte truncated sequence:&#xFFFD;
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-utf-8-one.html
@@ -0,0 +1,5 @@
+<!doctype html>
+<meta charset=utf-8>
+<title>UTF-8 file ending with a one-byte truncated sequence</title>
+<link rel=match href="eof-utf-8-one-ref.html">
+One-byte truncated sequence:ð
\ No newline at end of file
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-utf-8-three-ref.html
@@ -0,0 +1,4 @@
+<!doctype html>
+<meta charset=utf-8>
+<title>UTF-8 file ending with a three-byte truncated sequence</title>
+Three-byte truncated sequence:&#xFFFD;
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-utf-8-three.html
@@ -0,0 +1,5 @@
+<!doctype html>
+<meta charset=utf-8>
+<title>UTF-8 file ending with a three-byte truncated sequence</title>
+<link rel=match href="eof-utf-8-three-ref.html">
+Three-byte truncated sequence:ðŸ’
\ No newline at end of file
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-utf-8-two-ref.html
@@ -0,0 +1,4 @@
+<!doctype html>
+<meta charset=utf-8>
+<title>UTF-8 file ending with a two-byte truncated sequence</title>
+Two-byte truncated sequence:&#xFFFD;
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/encoding/eof-utf-8-two.html
@@ -0,0 +1,5 @@
+<!doctype html>
+<meta charset=utf-8>
+<title>UTF-8 file ending with a two-byte truncated sequence</title>
+<link rel=match href="eof-utf-8-two-ref.html">
+Two-byte truncated sequence:ðŸ
\ No newline at end of file