Bug 1301854 - detect BOM when determining resource encoding; r?pbro draft
authorTom Tromey <tom@tromey.com>
Fri, 16 Sep 2016 13:02:33 -0600
changeset 415529 a15822bdab9b5f4ab8585ae7af2f41e9b92517e0
parent 415105 d6809e466fe6c6cdeebeebe8878ee0d03e59d4f6
child 531632 ead3073c6da71ff67447ffc8e1f25cd66c2fedd0
push id29896
push userbmo:ttromey@mozilla.com
push dateTue, 20 Sep 2016 16:09:39 +0000
reviewerspbro
bugs1301854
milestone51.0a1
Bug 1301854 - detect BOM when determining resource encoding; r?pbro MozReview-Commit-ID: 6akivZSRwVN
devtools/client/styleeditor/test/browser.ini
devtools/client/styleeditor/test/browser_styleeditor_bom.js
devtools/client/styleeditor/test/utf-16.css
devtools/server/actors/styleeditor.js
devtools/server/actors/stylesheets.js
devtools/shared/DevToolsUtils.js
devtools/shared/tests/unit/test_fetch-bom.js
devtools/shared/tests/unit/xpcshell.ini
--- a/devtools/client/styleeditor/test/browser.ini
+++ b/devtools/client/styleeditor/test/browser.ini
@@ -45,26 +45,28 @@ support-files =
   sourcemaps-watching.html
   test_private.css
   test_private.html
   doc_long.css
   doc_uncached.css
   doc_uncached.html
   doc_xulpage.xul
   sync.html
+  utf-16.css
   !/devtools/client/commandline/test/helpers.js
   !/devtools/client/framework/test/shared-head.js
   !/devtools/client/inspector/shared/test/head.js
   !/devtools/client/inspector/test/head.js
   !/devtools/client/inspector/test/shared-head.js
   !/devtools/client/shared/test/test-actor-registry.js
   !/devtools/client/shared/test/test-actor.js
 
 [browser_styleeditor_autocomplete.js]
 [browser_styleeditor_autocomplete-disabled.js]
+[browser_styleeditor_bom.js]
 [browser_styleeditor_bug_740541_iframes.js]
 [browser_styleeditor_bug_851132_middle_click.js]
 [browser_styleeditor_bug_870339.js]
 [browser_styleeditor_cmd_edit.js]
 [browser_styleeditor_enabled.js]
 [browser_styleeditor_fetch-from-cache.js]
 [browser_styleeditor_filesave.js]
 [browser_styleeditor_highlight-selector.js]
new file mode 100644
--- /dev/null
+++ b/devtools/client/styleeditor/test/browser_styleeditor_bom.js
@@ -0,0 +1,34 @@
+/* vim: set ts=2 et sw=2 tw=80: */
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+const BOM_CSS = TEST_BASE_HTTPS + "utf-16.css";
+const DOCUMENT = "data:text/html;charset=UTF-8," +
+        encodeURIComponent(
+          ["<!DOCTYPE html>",
+           "<html>",
+           " <head>",
+           "  <title>Bug 1301854</title>",
+           '  <link rel="stylesheet" type="text/css" href="' + BOM_CSS + '">',
+           " </head>",
+           " <body>",
+           " </body>",
+           "</html>"
+          ].join("\n"));
+
+const CONTENTS = "// Note that this file must be utf-16 with a " +
+      "BOM for the test to make sense.\n";
+
+add_task(function* () {
+  let {ui} = yield openStyleEditorForURL(DOCUMENT);
+
+  is(ui.editors.length, 1, "correct number of editors");
+
+  let editor = ui.editors[0];
+  yield editor.getSourceEditor();
+
+  let text = editor.sourceEditor.getText();
+  is(text, CONTENTS, "editor contains expected text");
+});
new file mode 100644
index 0000000000000000000000000000000000000000..92ff5eac5303cd990264d2e45fb949230abd9593
GIT binary patch
literal 156
zc$_VbO%6a%41~Y6Q@jfxHV$BqGmsGCFX4&XV_r#<>9o_C-VdqDQL*9a#5@Cy@M9x~
zSg$^tx^QX})Yj%xyeB3j*L3MFxSSFk1vRC)H}*g8`_TOvUP{Ppm`pi@t6@U)0^une
AhyVZp
--- a/devtools/server/actors/styleeditor.js
+++ b/devtools/server/actors/styleeditor.js
@@ -203,28 +203,20 @@ var OldStyleSheetActor = protocol.ActorC
       this.text = content;
       return content;
     });
   },
 
   /**
    * Get the charset of the stylesheet according to the character set rules
    * defined in <http://www.w3.org/TR/CSS2/syndata.html#charset>.
-   *
-   * @param string channelCharset
-   *        Charset of the source string if set by the HTTP channel.
+   * Note that some of the algorithm is implemented in DevToolsUtils.fetch.
    */
-  _getCSSCharset: function (channelCharset)
+  _getCSSCharset: function ()
   {
-    // StyleSheet's charset can be specified from multiple sources
-    if (channelCharset && channelCharset.length > 0) {
-      // step 1 of syndata.html: charset given in HTTP header.
-      return channelCharset;
-    }
-
     let sheet = this.rawSheet;
     if (sheet) {
       // Do we have a @charset rule in the stylesheet?
       // step 2 of syndata.html (without the BOM check).
       if (sheet.cssRules) {
         let rules = sheet.cssRules;
         if (rules.length
             && rules.item(0).type == Ci.nsIDOMCSSRule.CHARSET_RULE) {
--- a/devtools/server/actors/stylesheets.js
+++ b/devtools/server/actors/stylesheets.js
@@ -660,28 +660,20 @@ var StyleSheetActor = protocol.ActorClas
       }
       return mediaRules;
     });
   },
 
   /**
    * Get the charset of the stylesheet according to the character set rules
    * defined in <http://www.w3.org/TR/CSS2/syndata.html#charset>.
-   *
-   * @param string channelCharset
-   *        Charset of the source string if set by the HTTP channel.
+   * Note that some of the algorithm is implemented in DevToolsUtils.fetch.
    */
-  _getCSSCharset: function (channelCharset)
+  _getCSSCharset: function ()
   {
-    // StyleSheet's charset can be specified from multiple sources
-    if (channelCharset && channelCharset.length > 0) {
-      // step 1 of syndata.html: charset given in HTTP header.
-      return channelCharset;
-    }
-
     let sheet = this.rawSheet;
     if (sheet) {
       // Do we have a @charset rule in the stylesheet?
       // step 2 of syndata.html (without the BOM check).
       if (sheet.cssRules) {
         let rules = sheet.cssRules;
         if (rules.length
             && rules.item(0).type == Ci.nsIDOMCSSRule.CHARSET_RULE) {
--- a/devtools/shared/DevToolsUtils.js
+++ b/devtools/shared/DevToolsUtils.js
@@ -443,23 +443,41 @@ function mainThreadFetch(aURL, aOptions 
       // might fail and we lose the stream data. This means we can't fall back
       // to using the locale default encoding (bug 1181345).
 
       // Read and decode the data according to the locale default encoding.
       let available = stream.available();
       let source = NetUtil.readInputStreamToString(stream, available);
       stream.close();
 
+      // We do our own BOM sniffing here because there's no convenient
+      // implementation of the "decode" algorithm
+      // (https://encoding.spec.whatwg.org/#decode) exposed to JS.
+      let bomCharset = null;
+      if (available >= 3 && source.codePointAt(0) == 0xef &&
+          source.codePointAt(1) == 0xbb && source.codePointAt(2) == 0xbf) {
+        bomCharset = "UTF-8";
+        source = source.slice(3);
+      } else if (available >= 2 && source.codePointAt(0) == 0xfe &&
+                 source.codePointAt(1) == 0xff) {
+        bomCharset = "UTF-16BE";
+        source = source.slice(2);
+      } else if (available >= 2 && source.codePointAt(0) == 0xff &&
+                 source.codePointAt(1) == 0xfe) {
+        bomCharset = "UTF-16LE";
+        source = source.slice(2);
+      }
+
       // If the channel or the caller has correct charset information, the
       // content will be decoded correctly. If we have to fall back to UTF-8 and
       // the guess is wrong, the conversion fails and convertToUnicode returns
       // the input unmodified. Essentially we try to decode the data as UTF-8
       // and if that fails, we use the locale specific default encoding. This is
       // the best we can do if the source does not provide charset info.
-      let charset = channel.contentCharset || aOptions.charset || "UTF-8";
+      let charset = bomCharset || channel.contentCharset || aOptions.charset || "UTF-8";
       let unicodeSource = NetworkHelper.convertToUnicode(source, charset);
 
       deferred.resolve({
         content: unicodeSource,
         contentType: request.contentType
       });
     } catch (ex) {
       let uri = request.originalURI;
new file mode 100644
--- /dev/null
+++ b/devtools/shared/tests/unit/test_fetch-bom.js
@@ -0,0 +1,76 @@
+/* vim: set ft=javascript ts=2 et sw=2 tw=80: */
+/* Any copyright is dedicated to the Public Domain.
+   http://creativecommons.org/publicdomain/zero/1.0/ */
+"use strict";
+
+// Tests for DevToolsUtils.fetch BOM detection.
+
+const CC = Components.Constructor;
+
+const { HttpServer } = Cu.import("resource://testing-common/httpd.js", {});
+const BinaryOutputStream = CC("@mozilla.org/binaryoutputstream;1",
+                              "nsIBinaryOutputStream", "setOutputStream");
+
+function write8(bos) {
+  bos.write8(0xef);
+  bos.write8(0xbb);
+  bos.write8(0xbf);
+  bos.write8(0x68);
+  bos.write8(0xc4);
+  bos.write8(0xb1);
+}
+
+function write16be(bos) {
+  bos.write8(0xfe);
+  bos.write8(0xff);
+  bos.write8(0x00);
+  bos.write8(0x68);
+  bos.write8(0x01);
+  bos.write8(0x31);
+}
+
+function write16le(bos) {
+  bos.write8(0xff);
+  bos.write8(0xfe);
+  bos.write8(0x68);
+  bos.write8(0x00);
+  bos.write8(0x31);
+  bos.write8(0x01);
+}
+
+function getHandler(writer) {
+  return function (request, response) {
+    response.setStatusLine(request.httpVersion, 200, "OK");
+
+    let bos = new BinaryOutputStream(response.bodyOutputStream);
+    writer(bos);
+  };
+}
+
+const server = new HttpServer();
+server.registerDirectory("/", do_get_cwd());
+server.registerPathHandler("/u8", getHandler(write8));
+server.registerPathHandler("/u16be", getHandler(write16be));
+server.registerPathHandler("/u16le", getHandler(write16le));
+server.start(-1);
+
+const port = server.identity.primaryPort;
+const serverURL = "http://localhost:" + port;
+
+do_register_cleanup(() => {
+  return new Promise(resolve => server.stop(resolve));
+});
+
+add_task(function* () {
+  yield test_one(serverURL + "/u8", "UTF-8");
+  yield test_one(serverURL + "/u16be", "UTF-16BE");
+  yield test_one(serverURL + "/u16le", "UTF-16LE");
+});
+
+function* test_one(url, encoding) {
+  // Be sure to set the encoding to something that will yield an
+  // invalid result if BOM sniffing is not done.
+  yield DevToolsUtils.fetch(url, { charset: "ISO-8859-1" }).then(({content}) => {
+    do_check_eq(content, "hı", "The content looks correct for " + encoding);
+  });
+}
--- a/devtools/shared/tests/unit/xpcshell.ini
+++ b/devtools/shared/tests/unit/xpcshell.ini
@@ -5,16 +5,17 @@ tail =
 firefox-appdir = browser
 skip-if = toolkit == 'android' || toolkit == 'gonk'
 support-files =
   exposeLoader.js
 
 [test_assert.js]
 [test_csslexer.js]
 [test_css-properties-db.js]
+[test_fetch-bom.js]
 [test_fetch-chrome.js]
 [test_fetch-file.js]
 [test_fetch-http.js]
 [test_fetch-resource.js]
 [test_flatten.js]
 [test_indentation.js]
 [test_independent_loaders.js]
 [test_invisible_loader.js]