author | Henri Sivonen <hsivonen@hsivonen.fi> |
Wed, 21 Mar 2018 12:26:41 +0200 | |
changeset 771518 | ece6c9742bb30761c697ebbe17d3f0881c4fb14d |
parent 770054 | 38feba9d243f4c1a0c9bfcd972a01b6399f2ecad |
push id | 103704 |
push user | bmo:hsivonen@hsivonen.fi |
push date | Fri, 23 Mar 2018 09:24:55 +0000 |
bugs | 1402247 |
milestone | 61.0a1 |
--- a/servo/support/gecko/nsstring/src/conversions.rs +++ b/servo/support/gecko/nsstring/src/conversions.rs @@ -247,17 +247,17 @@ impl nsACString { if needed <= available { written = num_ascii + convert_utf16_to_utf8(&other[num_ascii..], &mut buffer[filled..]); (0, 0, 0) } else { (needed, filled, num_ascii) } } else { let needed = times_three_plus_one(other.len()).ok_or(())?; - (needed, 0, 0) + (needed, old_len, 0) }; if needed != 0 { let buffer = unsafe { self.fallible_maybe_expand_capacity(filled.checked_add(needed).ok_or(())?)? }; written = num_ascii + convert_utf16_to_utf8(&other[num_ascii..], &mut buffer[filled..]); } } else {
--- a/xpcom/ds/nsAtomTable.cpp +++ b/xpcom/ds/nsAtomTable.cpp @@ -204,28 +204,22 @@ struct AtomTableKey : mUTF16String(aUTF16String) , mUTF8String(nullptr) , mLength(aLength) { mHash = HashString(mUTF16String, mLength); *aHashOut = mHash; } - AtomTableKey(const char* aUTF8String, uint32_t aLength, uint32_t* aHashOut) + AtomTableKey(const char* aUTF8String, uint32_t aLength, uint32_t* aHashOut, bool* aErr) : mUTF16String(nullptr) , mUTF8String(aUTF8String) , mLength(aLength) { - bool err; - mHash = HashUTF8AsUTF16(mUTF8String, mLength, &err); - if (err) { - mUTF8String = nullptr; - mLength = 0; - mHash = 0; - } + mHash = HashUTF8AsUTF16(mUTF8String, mLength, aErr); *aHashOut = mHash; } const char16_t* mUTF16String; const char* mUTF8String; uint32_t mLength; uint32_t mHash; }; @@ -725,36 +719,41 @@ NS_Atomize(const char* aUTF8String) MOZ_ASSERT(gAtomTable); return gAtomTable->Atomize(nsDependentCString(aUTF8String)); } already_AddRefed<nsAtom> nsAtomTable::Atomize(const nsACString& aUTF8String) { uint32_t hash; - AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash); + bool err; + AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash, &err); + if (MOZ_UNLIKELY(err)) { + MOZ_ASSERT_UNREACHABLE("Tried to atomize invalid UTF-8."); + // The input was invalid UTF-8. Let's replace the errors with U+FFFD + // and atomize the result. + nsString str; + CopyUTF8toUTF16(aUTF8String, str); + return Atomize(str); + } nsAtomSubTable& table = SelectSubTable(key); MutexAutoLock lock(table.mLock); AtomTableEntry* he = table.Add(key); if (he->mAtom) { RefPtr<nsAtom> atom = he->mAtom; return atom.forget(); } // This results in an extra addref/release of the nsStringBuffer. // Unfortunately there doesn't seem to be any APIs to avoid that. // Actually, now there is, sort of: ForgetSharedBuffer. nsString str; - // If the input was invalid UTF-8, hash is zero and we atomicize - // the empty string - if (hash) { - CopyUTF8toUTF16(aUTF8String, str); - } + CopyUTF8toUTF16(aUTF8String, str); RefPtr<nsAtom> atom = dont_AddRef(new nsDynamicAtom(str, hash)); he->mAtom = atom; return atom.forget(); } already_AddRefed<nsAtom>
--- a/xpcom/string/nsReadableUtils.cpp +++ b/xpcom/string/nsReadableUtils.cpp @@ -641,41 +641,37 @@ CompareUTF8toUTF16(const nsACString& aUT aUTF8String.BeginReading(u8); aUTF8String.EndReading(u8end); const char16_t* u16; const char16_t* u16end; aUTF16String.BeginReading(u16); aUTF16String.EndReading(u16end); - bool err = false; for (;;) { if (u8 == u8end) { if (u16 == u16end) { return 0; } return -1; } if (u16 == u16end) { return 1; } // No need for ASCII optimization, since both NextChar() - // calls get inlined. The calls below never set err to false, - // so it's OK not to check between the two calls. - uint32_t scalar8 = UTF8CharEnumerator::NextChar(&u8, u8end, &err); - uint32_t scalar16 = UTF16CharEnumerator::NextChar(&u16, u16end, &err); - if (err) { - return INT32_MIN; + // calls get inlined. + uint32_t scalar8 = UTF8CharEnumerator::NextChar(&u8, u8end); + uint32_t scalar16 = UTF16CharEnumerator::NextChar(&u16, u16end); + if (scalar16 == scalar8) { + continue; } if (scalar8 < scalar16) { return -1; } - if (scalar16 > scalar8) { - return 1; - } + return 1; } } void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest) { NS_ASSERTION(IS_VALID_CHAR(aSource), "Invalid UCS4 char"); if (IS_IN_BMP(aSource)) {
--- a/xpcom/string/nsReadableUtils.h +++ b/xpcom/string/nsReadableUtils.h @@ -566,19 +566,19 @@ const nsCString& EmptyCString(); const nsString& VoidString(); const nsCString& VoidCString(); /** * Compare a UTF-8 string to an UTF-16 string. * * Returns 0 if the strings are equal, -1 if aUTF8String is less -* than aUTF16Count, and 1 in the reverse case. In case of fatal -* error (eg the strings are not valid UTF8 and UTF16 respectively), -* this method will return INT32_MIN. +* than aUTF16Count, and 1 in the reverse case. Errors are replaced +* with U+FFFD and then the U+FFFD is compared as if it had occurred +* in the input. */ int32_t CompareUTF8toUTF16(const nsACString& aUTF8String, const nsAString& aUTF16String); void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest); template<class T> inline bool
--- a/xpcom/string/nsUTF8Utils.h +++ b/xpcom/string/nsUTF8Utils.h @@ -50,32 +50,34 @@ public: /** * Extract the next Unicode scalar value from the buffer and return it. The * pointer passed in is advanced to the start of the next character in the * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced * over the maximal valid prefix and *aErr is set to true (if aErr is not * null). * - * Precondition: *aBuffer < aEnd; *aErr == false (if aErr is not null) + * Note: This method never sets *aErr to false to allow error accumulation + * across multiple calls. + * + * Precondition: *aBuffer < aEnd */ class UTF8CharEnumerator { public: static inline uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr = nullptr) { MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); MOZ_ASSERT(aEnd, "null end pointer"); const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer); const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd); MOZ_ASSERT(p, "null buffer"); MOZ_ASSERT(p < end, "Bogus range"); - MOZ_ASSERT_IF(aErr, !*aErr); unsigned char first = *p++; if (MOZ_LIKELY(first < 0x80U)) { *aBuffer = reinterpret_cast<const char*>(p); return first; } @@ -159,31 +161,33 @@ public: }; /** * Extract the next Unicode scalar value from the buffer and return it. The * pointer passed in is advanced to the start of the next character in the * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over * the unpaired surrogate and *aErr is set to true (if aErr is not null). * + * Note: This method never sets *aErr to false to allow error accumulation + * across multiple calls. + * * Precondition: *aBuffer < aEnd */ class UTF16CharEnumerator { public: static inline uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd, bool* aErr = nullptr) { MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); MOZ_ASSERT(aEnd, "null end pointer"); const char16_t* p = *aBuffer; MOZ_ASSERT(p, "null buffer"); MOZ_ASSERT(p < aEnd, "Bogus range"); - MOZ_ASSERT_IF(aErr, !*aErr); char16_t c = *p++; // Let's use encoding_rs-style code golf here. // Unsigned underflow is defined behavior char16_t cMinusSurrogateStart = c - 0xD800U; if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) { *aBuffer = p;
--- a/xpcom/tests/gtest/TestAtoms.cpp +++ b/xpcom/tests/gtest/TestAtoms.cpp @@ -89,38 +89,42 @@ TEST(Atoms, Invalid) { RefPtr<nsAtom> atom16 = NS_Atomize(Invalid16Strings[i].m16); EXPECT_TRUE(atom16->Equals(nsDependentString(Invalid16Strings[i].m16))); } EXPECT_EQ(count, NS_GetNumberOfAtoms()); } +// Don't run this test in debug builds as that intentionally asserts. +#ifndef DEBUG for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) { nsrefcnt count = NS_GetNumberOfAtoms(); { RefPtr<nsAtom> atom8 = NS_Atomize(Invalid8Strings[i].m8); RefPtr<nsAtom> atom16 = NS_Atomize(Invalid8Strings[i].m16); EXPECT_EQ(atom16, atom8); EXPECT_TRUE(atom16->Equals(nsDependentString(Invalid8Strings[i].m16))); } EXPECT_EQ(count, NS_GetNumberOfAtoms()); } -// Don't run this test in debug builds as that intentionally asserts. -#ifndef DEBUG - RefPtr<nsAtom> emptyAtom = NS_Atomize(""); - for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) { nsrefcnt count = NS_GetNumberOfAtoms(); - RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i]); - EXPECT_EQ(atom8, emptyAtom); + { + RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i]); + nsString str16; + CopyUTF8toUTF16(MakeStringSpan(Malformed8Strings[i]), str16); + RefPtr<nsAtom> atom16 = NS_Atomize(str16); + EXPECT_EQ(atom8, atom16); + } + EXPECT_EQ(count, NS_GetNumberOfAtoms()); } #endif } #define FIRST_ATOM_STR "first static atom. Hello!" #define SECOND_ATOM_STR "second static atom. @World!" #define THIRD_ATOM_STR "third static atom?!"
--- a/xpcom/tests/gtest/TestTextFormatter.cpp +++ b/xpcom/tests/gtest/TestTextFormatter.cpp @@ -12,17 +12,16 @@ TEST(TextFormatter, Tests) nsAutoString fmt(NS_LITERAL_STRING("%3$s %4$S %1$d %2$d %2$d %3$s")); char utf8[] = "Hello"; char16_t ucs2[]={'W', 'o', 'r', 'l', 'd', 0x4e00, 0xAc00, 0xFF45, 0x0103, 0x00}; int d=3; char16_t buf[256]; nsTextFormatter::snprintf(buf, 256, fmt.get(), d, 333, utf8, ucs2); nsAutoString out(buf); - ASSERT_STREQ("Hello World", NS_LossyConvertUTF16toASCII(out).get()); const char16_t *uout = out.get(); const char16_t expected[] = {0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x57, 0x6F, 0x72, 0x6C, 0x64, 0x4E00, 0xAC00, 0xFF45, 0x0103, 0x20, 0x33, 0x20, 0x33, 0x33, 0x33, 0x20, 0x33, 0x33, 0x33, 0x20, 0x48, 0x65, 0x6C, 0x6C, 0x6F};
--- a/xpcom/tests/gtest/TestUTF.cpp +++ b/xpcom/tests/gtest/TestUTF.cpp @@ -102,30 +102,26 @@ TEST(UTF, Hash16) EXPECT_EQ(HashString(ValidStrings[i].m16), HashUTF8AsUTF16(str8.get(), str8.Length(), &err)); EXPECT_FALSE(err); } for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) { nsDependentCString str8(Invalid8Strings[i].m8); bool err; - EXPECT_EQ(HashString(Invalid8Strings[i].m16), - HashUTF8AsUTF16(str8.get(), str8.Length(), &err)); - EXPECT_FALSE(err); + EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u); + EXPECT_TRUE(err); } -// Don't run this test in debug builds as that intentionally asserts. -#ifndef DEBUG for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) { nsDependentCString str8(Malformed8Strings[i]); bool err; EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u); EXPECT_TRUE(err); } -#endif } /** * This tests the handling of a non-ascii character at various locations in a * UTF-16 string that is being converted to UTF-8. */ void NonASCII16_helper(const size_t aStrSize) {
--- a/xpcom/tests/gtest/UTFStrings.h +++ b/xpcom/tests/gtest/UTFStrings.h @@ -56,40 +56,39 @@ static const UTFStringsStringPair Invali { { 0xDC00, 0xD800, 0xDC00, 0xD800 }, { char(0xEF), char(0xBF), char(0xBD), char(0xF0), char(0x90), char(0x80), char(0x80), char(0xEF), char(0xBF), char(0xBD) } }, { { 0xDC00, 0xD800, 0xD800, 0xDC00 }, { char(0xEF), char(0xBF), char(0xBD), char(0xEF), char(0xBF), char(0xBD), char(0xF0), char(0x90), char(0x80), char(0x80) } }, }; static const UTFStringsStringPair Invalid8Strings[] = { - { { 'a', 0xFFFD, 'b' }, + { { 'a', 0xFFFD, 0xFFFD, 'b' }, { 'a', char(0xC0), char(0x80), 'b' } }, - { { 0xFFFD, 0x80 }, + { { 0xFFFD, 0xFFFD, 0x80 }, { char(0xC1), char(0xBF), char(0xC2), char(0x80) } }, - { { 0xFFFD }, + { { 0xFFFD, 0xFFFD }, { char(0xC1), char(0xBF) } }, - { { 0xFFFD, 'x', 0x0800 }, + { { 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0x0800 }, { char(0xE0), char(0x80), char(0x80), 'x', char(0xE0), char(0xA0), char(0x80) } }, - { { 0xFFFD, 'x', 0xFFFD }, + { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, { char(0xF0), char(0x80), char(0x80), char(0x80), 'x', char(0xF0), char(0x80), char(0x8F), char(0x80) } }, - { { 0xFFFD, 0xFFFD }, + { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, { char(0xF4), char(0x90), char(0x80), char(0x80), char(0xF7), char(0xBF), char(0xBF), char(0xBF) } }, - { { 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD }, + { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, { char(0xF0), char(0x8F), char(0xBF), char(0xBF), 'x', char(0xF0), char(0x90), char(0x80), char(0x80), char(0xF0), char(0x8F), char(0xBF), char(0xBF) } }, - { { 0xFFFD, 'x', 0xFFFD }, + { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, { char(0xF8), char(0x80), char(0x80), char(0x80), char(0x80), 'x', char(0xF8), char(0x88), char(0x80), char(0x80), char(0x80) } }, - { { 0xFFFD, 0xFFFD }, + { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, { char(0xFB), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xFC), char(0xA0), char(0x80), char(0x80), char(0x80), char(0x80) } }, - { { 0xFFFD, 0xFFFD }, + { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, { char(0xFC), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0xFD), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xBF) } }, }; // Don't use this array in debug builds as that intentionally asserts. -#ifndef DEBUG static const char Malformed8Strings[][16] = { { char(0x80) }, { 'a', char(0xC8), 'c' }, { 'a', char(0xC0) }, { 'a', char(0xE8), 'c' }, { 'a', char(0xE8), char(0x80), 'c' }, { 'a', char(0xE8), char(0x80) }, @@ -102,11 +101,10 @@ static const char Malformed8Strings[][16 { 'a', char(0xFA), 'c' }, { 'a', char(0xFA), char(0x80), char(0x80), 0x7F, char(0x80), 'c' }, { 'a', char(0xFA), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), 'c' }, { 'a', char(0xFD) }, { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), 'c' }, { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80) }, { 'a', char(0xFC), char(0x80), char(0x80), 0x40, char(0x80), char(0x80), 'c' }, }; -#endif #endif