Bug 1402247 part 11 - Use U+FFFD when atomizing invalid UTF-8. draft
authorHenri Sivonen <hsivonen@hsivonen.fi>
Wed, 21 Mar 2018 12:26:41 +0200
changeset 771518 ece6c9742bb30761c697ebbe17d3f0881c4fb14d
parent 770054 38feba9d243f4c1a0c9bfcd972a01b6399f2ecad
push id103704
push userbmo:hsivonen@hsivonen.fi
push dateFri, 23 Mar 2018 09:24:55 +0000
bugs1402247
milestone61.0a1
Bug 1402247 part 11 - Use U+FFFD when atomizing invalid UTF-8. MozReview-Commit-ID: ImhErHMWRzl
servo/support/gecko/nsstring/src/conversions.rs
xpcom/ds/nsAtomTable.cpp
xpcom/string/nsReadableUtils.cpp
xpcom/string/nsReadableUtils.h
xpcom/string/nsUTF8Utils.h
xpcom/tests/gtest/TestAtoms.cpp
xpcom/tests/gtest/TestTextFormatter.cpp
xpcom/tests/gtest/TestUTF.cpp
xpcom/tests/gtest/UTFStrings.h
--- a/servo/support/gecko/nsstring/src/conversions.rs
+++ b/servo/support/gecko/nsstring/src/conversions.rs
@@ -247,17 +247,17 @@ impl nsACString {
                     if needed <= available {
                         written = num_ascii + convert_utf16_to_utf8(&other[num_ascii..], &mut buffer[filled..]);
                         (0, 0, 0)
                     } else {
                         (needed, filled, num_ascii)
                     }
                 } else {
                     let needed = times_three_plus_one(other.len()).ok_or(())?;
-                    (needed, 0, 0)
+                    (needed, old_len, 0)
                 };
                 if needed != 0 {
                     let buffer = unsafe {
                         self.fallible_maybe_expand_capacity(filled.checked_add(needed).ok_or(())?)?
                     };
                     written = num_ascii + convert_utf16_to_utf8(&other[num_ascii..], &mut buffer[filled..]);
                 }
             } else {
--- a/xpcom/ds/nsAtomTable.cpp
+++ b/xpcom/ds/nsAtomTable.cpp
@@ -204,28 +204,22 @@ struct AtomTableKey
     : mUTF16String(aUTF16String)
     , mUTF8String(nullptr)
     , mLength(aLength)
   {
     mHash = HashString(mUTF16String, mLength);
     *aHashOut = mHash;
   }
 
-  AtomTableKey(const char* aUTF8String, uint32_t aLength, uint32_t* aHashOut)
+  AtomTableKey(const char* aUTF8String, uint32_t aLength, uint32_t* aHashOut, bool* aErr)
     : mUTF16String(nullptr)
     , mUTF8String(aUTF8String)
     , mLength(aLength)
   {
-    bool err;
-    mHash = HashUTF8AsUTF16(mUTF8String, mLength, &err);
-    if (err) {
-      mUTF8String = nullptr;
-      mLength = 0;
-      mHash = 0;
-    }
+    mHash = HashUTF8AsUTF16(mUTF8String, mLength, aErr);
     *aHashOut = mHash;
   }
 
   const char16_t* mUTF16String;
   const char* mUTF8String;
   uint32_t mLength;
   uint32_t mHash;
 };
@@ -725,36 +719,41 @@ NS_Atomize(const char* aUTF8String)
   MOZ_ASSERT(gAtomTable);
   return gAtomTable->Atomize(nsDependentCString(aUTF8String));
 }
 
 already_AddRefed<nsAtom>
 nsAtomTable::Atomize(const nsACString& aUTF8String)
 {
   uint32_t hash;
-  AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash);
+  bool err;
+  AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash, &err);
+  if (MOZ_UNLIKELY(err)) {
+    MOZ_ASSERT_UNREACHABLE("Tried to atomize invalid UTF-8.");
+    // The input was invalid UTF-8. Let's replace the errors with U+FFFD
+    // and atomize the result.
+    nsString str;
+    CopyUTF8toUTF16(aUTF8String, str);
+    return Atomize(str);
+  }
   nsAtomSubTable& table = SelectSubTable(key);
   MutexAutoLock lock(table.mLock);
   AtomTableEntry* he = table.Add(key);
 
   if (he->mAtom) {
     RefPtr<nsAtom> atom = he->mAtom;
 
     return atom.forget();
   }
 
   // This results in an extra addref/release of the nsStringBuffer.
   // Unfortunately there doesn't seem to be any APIs to avoid that.
   // Actually, now there is, sort of: ForgetSharedBuffer.
   nsString str;
-  // If the input was invalid UTF-8, hash is zero and we atomicize
-  // the empty string
-  if (hash) {
-    CopyUTF8toUTF16(aUTF8String, str);
-  }
+  CopyUTF8toUTF16(aUTF8String, str);
   RefPtr<nsAtom> atom = dont_AddRef(new nsDynamicAtom(str, hash));
 
   he->mAtom = atom;
 
   return atom.forget();
 }
 
 already_AddRefed<nsAtom>
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -641,41 +641,37 @@ CompareUTF8toUTF16(const nsACString& aUT
   aUTF8String.BeginReading(u8);
   aUTF8String.EndReading(u8end);
 
   const char16_t* u16;
   const char16_t* u16end;
   aUTF16String.BeginReading(u16);
   aUTF16String.EndReading(u16end);
 
-  bool err = false;
   for (;;) {
     if (u8 == u8end) {
       if (u16 == u16end) {
         return 0;
       }
       return -1;
     }
     if (u16 == u16end) {
       return 1;
     }
     // No need for ASCII optimization, since both NextChar()
-    // calls get inlined. The calls below never set err to false,
-    // so it's OK not to check between the two calls.
-    uint32_t scalar8 = UTF8CharEnumerator::NextChar(&u8, u8end, &err);
-    uint32_t scalar16 = UTF16CharEnumerator::NextChar(&u16, u16end, &err);
-    if (err) {
-      return INT32_MIN;
+    // calls get inlined.
+    uint32_t scalar8 = UTF8CharEnumerator::NextChar(&u8, u8end);
+    uint32_t scalar16 = UTF16CharEnumerator::NextChar(&u16, u16end);
+    if (scalar16 == scalar8) {
+      continue;
     }
     if (scalar8 < scalar16) {
       return -1;
     }
-    if (scalar16 > scalar8) {
-      return 1;
-    }
+    return 1;
   }
 }
 
 void
 AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest)
 {
   NS_ASSERTION(IS_VALID_CHAR(aSource), "Invalid UCS4 char");
   if (IS_IN_BMP(aSource)) {
--- a/xpcom/string/nsReadableUtils.h
+++ b/xpcom/string/nsReadableUtils.h
@@ -566,19 +566,19 @@ const nsCString& EmptyCString();
 
 const nsString& VoidString();
 const nsCString& VoidCString();
 
 /**
 * Compare a UTF-8 string to an UTF-16 string.
 *
 * Returns 0 if the strings are equal, -1 if aUTF8String is less
-* than aUTF16Count, and 1 in the reverse case.  In case of fatal
-* error (eg the strings are not valid UTF8 and UTF16 respectively),
-* this method will return INT32_MIN.
+* than aUTF16Count, and 1 in the reverse case. Errors are replaced
+* with U+FFFD and then the U+FFFD is compared as if it had occurred
+* in the input.
 */
 int32_t CompareUTF8toUTF16(const nsACString& aUTF8String,
                            const nsAString& aUTF16String);
 
 void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest);
 
 template<class T>
 inline bool
--- a/xpcom/string/nsUTF8Utils.h
+++ b/xpcom/string/nsUTF8Utils.h
@@ -50,32 +50,34 @@ public:
 
 /**
  * Extract the next Unicode scalar value from the buffer and return it. The
  * pointer passed in is advanced to the start of the next character in the
  * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
  * over the maximal valid prefix and *aErr is set to true (if aErr is not
  * null).
  *
- * Precondition: *aBuffer < aEnd; *aErr == false (if aErr is not null)
+ * Note: This method never sets *aErr to false to allow error accumulation
+ * across multiple calls.
+ *
+ * Precondition: *aBuffer < aEnd
  */
 class UTF8CharEnumerator
 {
 public:
   static inline uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr = nullptr)
   {
     MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
     MOZ_ASSERT(aEnd, "null end pointer");
 
     const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
     const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
 
     MOZ_ASSERT(p, "null buffer");
     MOZ_ASSERT(p < end, "Bogus range");
-    MOZ_ASSERT_IF(aErr, !*aErr);
 
     unsigned char first = *p++;
 
     if (MOZ_LIKELY(first < 0x80U)) {
       *aBuffer = reinterpret_cast<const char*>(p);
       return first;
     }
 
@@ -159,31 +161,33 @@ public:
 };
 
 /**
  * Extract the next Unicode scalar value from the buffer and return it. The
  * pointer passed in is advanced to the start of the next character in the
  * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
  * the unpaired surrogate and *aErr is set to true (if aErr is not null).
  *
+ * Note: This method never sets *aErr to false to allow error accumulation
+ * across multiple calls.
+ *
  * Precondition: *aBuffer < aEnd
  */
 class UTF16CharEnumerator
 {
 public:
   static inline uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd, bool* aErr = nullptr)
   {
     MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
     MOZ_ASSERT(aEnd, "null end pointer");
 
     const char16_t* p = *aBuffer;
 
     MOZ_ASSERT(p, "null buffer");
     MOZ_ASSERT(p < aEnd, "Bogus range");
-    MOZ_ASSERT_IF(aErr, !*aErr);
 
     char16_t c = *p++;
 
     // Let's use encoding_rs-style code golf here.
     // Unsigned underflow is defined behavior
     char16_t cMinusSurrogateStart = c - 0xD800U;
     if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
       *aBuffer = p;
--- a/xpcom/tests/gtest/TestAtoms.cpp
+++ b/xpcom/tests/gtest/TestAtoms.cpp
@@ -89,38 +89,42 @@ TEST(Atoms, Invalid)
     {
       RefPtr<nsAtom> atom16 = NS_Atomize(Invalid16Strings[i].m16);
       EXPECT_TRUE(atom16->Equals(nsDependentString(Invalid16Strings[i].m16)));
     }
 
     EXPECT_EQ(count, NS_GetNumberOfAtoms());
   }
 
+// Don't run this test in debug builds as that intentionally asserts.
+#ifndef DEBUG
   for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) {
     nsrefcnt count = NS_GetNumberOfAtoms();
 
     {
       RefPtr<nsAtom> atom8 = NS_Atomize(Invalid8Strings[i].m8);
       RefPtr<nsAtom> atom16 = NS_Atomize(Invalid8Strings[i].m16);
       EXPECT_EQ(atom16, atom8);
       EXPECT_TRUE(atom16->Equals(nsDependentString(Invalid8Strings[i].m16)));
     }
 
     EXPECT_EQ(count, NS_GetNumberOfAtoms());
   }
 
-// Don't run this test in debug builds as that intentionally asserts.
-#ifndef DEBUG
-  RefPtr<nsAtom> emptyAtom = NS_Atomize("");
-
   for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
     nsrefcnt count = NS_GetNumberOfAtoms();
 
-    RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i]);
-    EXPECT_EQ(atom8, emptyAtom);
+    {
+      RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i]);
+      nsString str16;
+      CopyUTF8toUTF16(MakeStringSpan(Malformed8Strings[i]), str16);
+      RefPtr<nsAtom> atom16 = NS_Atomize(str16);
+      EXPECT_EQ(atom8, atom16);
+    }
+
     EXPECT_EQ(count, NS_GetNumberOfAtoms());
   }
 #endif
 }
 
 #define FIRST_ATOM_STR "first static atom. Hello!"
 #define SECOND_ATOM_STR "second static atom. @World!"
 #define THIRD_ATOM_STR "third static atom?!"
--- a/xpcom/tests/gtest/TestTextFormatter.cpp
+++ b/xpcom/tests/gtest/TestTextFormatter.cpp
@@ -12,17 +12,16 @@ TEST(TextFormatter, Tests)
   nsAutoString fmt(NS_LITERAL_STRING("%3$s %4$S %1$d %2$d %2$d %3$s"));
   char utf8[] = "Hello";
   char16_t ucs2[]={'W', 'o', 'r', 'l', 'd', 0x4e00, 0xAc00, 0xFF45, 0x0103, 0x00};
   int d=3;
 
   char16_t buf[256];
   nsTextFormatter::snprintf(buf, 256, fmt.get(), d, 333, utf8, ucs2);
   nsAutoString out(buf);
-  ASSERT_STREQ("Hello World", NS_LossyConvertUTF16toASCII(out).get());
 
   const char16_t *uout = out.get();
   const char16_t expected[] = {0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20,
                                 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x4E00,
                                 0xAC00, 0xFF45, 0x0103, 0x20, 0x33,
                                 0x20, 0x33, 0x33, 0x33, 0x20, 0x33,
                                 0x33, 0x33, 0x20, 0x48, 0x65, 0x6C,
                                 0x6C, 0x6F};
--- a/xpcom/tests/gtest/TestUTF.cpp
+++ b/xpcom/tests/gtest/TestUTF.cpp
@@ -102,30 +102,26 @@ TEST(UTF, Hash16)
     EXPECT_EQ(HashString(ValidStrings[i].m16),
               HashUTF8AsUTF16(str8.get(), str8.Length(), &err));
     EXPECT_FALSE(err);
   }
 
   for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) {
     nsDependentCString str8(Invalid8Strings[i].m8);
     bool err;
-    EXPECT_EQ(HashString(Invalid8Strings[i].m16),
-              HashUTF8AsUTF16(str8.get(), str8.Length(), &err));
-    EXPECT_FALSE(err);
+    EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u);
+    EXPECT_TRUE(err);
   }
 
-// Don't run this test in debug builds as that intentionally asserts.
-#ifndef DEBUG
   for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
     nsDependentCString str8(Malformed8Strings[i]);
     bool err;
     EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u);
     EXPECT_TRUE(err);
   }
-#endif
 }
 
 /**
  * This tests the handling of a non-ascii character at various locations in a
  * UTF-16 string that is being converted to UTF-8.
  */
 void NonASCII16_helper(const size_t aStrSize)
 {
--- a/xpcom/tests/gtest/UTFStrings.h
+++ b/xpcom/tests/gtest/UTFStrings.h
@@ -56,40 +56,39 @@ static const UTFStringsStringPair Invali
     { { 0xDC00, 0xD800, 0xDC00, 0xD800 },
       { char(0xEF), char(0xBF), char(0xBD), char(0xF0), char(0x90), char(0x80), char(0x80), char(0xEF), char(0xBF), char(0xBD) } },
     { { 0xDC00, 0xD800, 0xD800, 0xDC00 },
       { char(0xEF), char(0xBF), char(0xBD), char(0xEF), char(0xBF), char(0xBD), char(0xF0), char(0x90), char(0x80), char(0x80) } },
   };
 
 static const UTFStringsStringPair Invalid8Strings[] =
   {
-    { { 'a', 0xFFFD, 'b' },
+    { { 'a', 0xFFFD, 0xFFFD, 'b' },
       { 'a', char(0xC0), char(0x80), 'b' } },
-    { { 0xFFFD, 0x80 },
+    { { 0xFFFD, 0xFFFD, 0x80 },
       { char(0xC1), char(0xBF), char(0xC2), char(0x80) } },
-    { { 0xFFFD },
+    { { 0xFFFD, 0xFFFD },
       { char(0xC1), char(0xBF) } },
-    { { 0xFFFD, 'x', 0x0800 },
+    { { 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0x0800 },
       { char(0xE0), char(0x80), char(0x80), 'x', char(0xE0), char(0xA0), char(0x80) } },
-    { { 0xFFFD, 'x', 0xFFFD },
+    { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
       { char(0xF0), char(0x80), char(0x80), char(0x80), 'x', char(0xF0), char(0x80), char(0x8F), char(0x80) } },
-    { { 0xFFFD, 0xFFFD },
+    { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
       { char(0xF4), char(0x90), char(0x80), char(0x80), char(0xF7), char(0xBF), char(0xBF), char(0xBF) } },
-    { { 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD },
+    { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
       { char(0xF0), char(0x8F), char(0xBF), char(0xBF), 'x', char(0xF0), char(0x90), char(0x80), char(0x80), char(0xF0), char(0x8F), char(0xBF), char(0xBF) } },
-    { { 0xFFFD, 'x', 0xFFFD },
+    { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
       { char(0xF8), char(0x80), char(0x80), char(0x80), char(0x80), 'x', char(0xF8), char(0x88), char(0x80), char(0x80), char(0x80) } },
-    { { 0xFFFD, 0xFFFD },
+    { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
       { char(0xFB), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xFC), char(0xA0), char(0x80), char(0x80), char(0x80), char(0x80) } },
-    { { 0xFFFD, 0xFFFD },
+    { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
       { char(0xFC), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0xFD), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xBF) } },
   };
 
 // Don't use this array in debug builds as that intentionally asserts.
-#ifndef DEBUG
 static const char Malformed8Strings[][16] =
   {
     { char(0x80) },
     { 'a', char(0xC8), 'c' },
     { 'a', char(0xC0) },
     { 'a', char(0xE8), 'c' },
     { 'a', char(0xE8), char(0x80), 'c' },
     { 'a', char(0xE8), char(0x80) },
@@ -102,11 +101,10 @@ static const char Malformed8Strings[][16
     { 'a', char(0xFA), 'c' },
     { 'a', char(0xFA), char(0x80), char(0x80), 0x7F, char(0x80), 'c' },
     { 'a', char(0xFA), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), 'c' },
     { 'a', char(0xFD) },
     { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), 'c' },
     { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80) },
     { 'a', char(0xFC), char(0x80), char(0x80), 0x40, char(0x80), char(0x80), 'c' },
   };
-#endif
 
 #endif