Bug 1402247 part 10 - Make UTF8CharEnumerator::NextChar() consistent with UTF-8 to UTF-16 string conversion.
MozReview-Commit-ID: 7lA6EVNX3Fy
--- a/intl/lwbrk/nsPangoBreaker.cpp
+++ b/intl/lwbrk/nsPangoBreaker.cpp
@@ -37,21 +37,23 @@ NS_GetComplexLineBreaks(const char16_t*
while (p < end)
{
aBreakBefore[u16Offset] = attr->is_line_break;
if (NS_IS_LOW_SURROGATE(aText[u16Offset]))
aBreakBefore[++u16Offset] = false; // Skip high surrogate
++u16Offset;
- bool err;
- uint32_t ch = UTF8CharEnumerator::NextChar(&p, end, &err);
+ // We're iterating over text obtained from NS_ConvertUTF16toUTF8,
+ // so we know we have valid UTF-8 and don't need to check for
+ // errors.
+ uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);
++attr;
- if (ch == 0 || err) {
+ if (!ch) {
// pango_break (pango 1.16.2) only analyses text before the
// first NUL (but sets one extra attr). Workaround loop to call
// pango_break again to analyse after the NUL is done somewhere else
// (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
// So, we do the same here for pango_get_log_attrs.
break;
}
}
--- a/xpcom/ds/nsAtomTable.cpp
+++ b/xpcom/ds/nsAtomTable.cpp
@@ -740,17 +740,21 @@ nsAtomTable::Atomize(const nsACString& a
return atom.forget();
}
// This results in an extra addref/release of the nsStringBuffer.
// Unfortunately there doesn't seem to be any APIs to avoid that.
// Actually, now there is, sort of: ForgetSharedBuffer.
nsString str;
- CopyUTF8toUTF16(aUTF8String, str);
+ // If the input was invalid UTF-8, hash is zero and we atomize
+ // the empty string.
+ if (hash) {
+ CopyUTF8toUTF16(aUTF8String, str);
+ }
RefPtr<nsAtom> atom = dont_AddRef(new nsDynamicAtom(str, hash));
he->mAtom = atom;
return atom.forget();
}
already_AddRefed<nsAtom>
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -631,86 +631,52 @@ VoidCString()
return sNull;
}
int32_t
CompareUTF8toUTF16(const nsACString& aUTF8String,
const nsAString& aUTF16String)
{
- static const uint32_t NOT_ASCII = uint32_t(~0x7F);
-
const char* u8;
const char* u8end;
aUTF8String.BeginReading(u8);
aUTF8String.EndReading(u8end);
const char16_t* u16;
const char16_t* u16end;
aUTF16String.BeginReading(u16);
aUTF16String.EndReading(u16end);
- while (u8 != u8end && u16 != u16end) {
- // Cast away the signedness of *u8 to prevent signextension when
- // converting to uint32_t
- uint32_t c8_32 = (uint8_t)*u8;
-
- if (c8_32 & NOT_ASCII) {
- bool err;
- c8_32 = UTF8CharEnumerator::NextChar(&u8, u8end, &err);
- if (err) {
- return INT32_MIN;
+ bool err = false;
+ for (;;) {
+ if (u8 == u8end) {
+ if (u16 == u16end) {
+ return 0;
}
-
- uint32_t c16_32 = UTF16CharEnumerator::NextChar(&u16, u16end);
- // The above UTF16CharEnumerator::NextChar() calls can
- // fail, but if it does for anything other than no data to
- // look at (which can't happen here), it returns the
- // Unicode replacement character 0xFFFD for the invalid
- // data they were fed. Ignore that error and treat invalid
- // UTF16 as 0xFFFD.
- //
- // This matches what our UTF16 to UTF8 conversion code
- // does, and thus a UTF8 string that came from an invalid
- // UTF16 string will compare equal to the invalid UTF16
- // string it came from. Same is true for any other UTF16
- // string differs only in the invalid part of the string.
-
- if (c8_32 != c16_32) {
- return c8_32 < c16_32 ? -1 : 1;
- }
- } else {
- if (c8_32 != *u16) {
- return c8_32 > *u16 ? 1 : -1;
- }
-
- ++u8;
- ++u16;
+ return -1;
+ }
+ if (u16 == u16end) {
+ return 1;
+ }
+ // No need for ASCII optimization, since both NextChar()
+ // calls get inlined. The calls below never set err to false,
+ // so it's OK not to check between the two calls.
+ uint32_t scalar8 = UTF8CharEnumerator::NextChar(&u8, u8end, &err);
+ uint32_t scalar16 = UTF16CharEnumerator::NextChar(&u16, u16end, &err);
+ if (err) {
+ return INT32_MIN;
+ }
+ if (scalar8 < scalar16) {
+ return -1;
+ }
+ if (scalar8 > scalar16) {
+ return 1;
}
}
-
- if (u8 != u8end) {
- // We get to the end of the UTF16 string, but no to the end of
- // the UTF8 string. The UTF8 string is longer than the UTF16
- // string
-
- return 1;
- }
-
- if (u16 != u16end) {
- // We get to the end of the UTF8 string, but no to the end of
- // the UTF16 string. The UTF16 string is longer than the UTF8
- // string
-
- return -1;
- }
-
- // The two strings match.
-
- return 0;
}
void
AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest)
{
NS_ASSERTION(IS_VALID_CHAR(aSource), "Invalid UCS4 char");
if (IS_IN_BMP(aSource)) {
aDest.Append(char16_t(aSource));
--- a/xpcom/string/nsUTF8Utils.h
+++ b/xpcom/string/nsUTF8Utils.h
@@ -41,243 +41,176 @@ public:
static bool is3byte(char aChar)
{
return (aChar & 0xF0) == 0xE0;
}
static bool is4byte(char aChar)
{
return (aChar & 0xF8) == 0xF0;
}
- static bool is5byte(char aChar)
+};
+
+/**
+ * Extract the next Unicode scalar value from the buffer and return it. The
+ * pointer passed in is advanced to the start of the next character in the
+ * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
+ * over the maximal valid prefix and *aErr is set to true (if aErr is not
+ * null). No byte at or beyond aEnd is ever read.
+ *
+ * Precondition: *aBuffer < aEnd; *aErr == false (if aErr is not null)
+ */
+class UTF8CharEnumerator
+{
+public:
+ static inline uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr = nullptr)
{
- return (aChar & 0xFC) == 0xF8;
- }
- static bool is6byte(char aChar)
- {
- return (aChar & 0xFE) == 0xFC;
+ MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+ MOZ_ASSERT(aEnd, "null end pointer");
+
+ const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
+ const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
+
+ MOZ_ASSERT(p, "null buffer");
+ MOZ_ASSERT(p < end, "Bogus range");
+ MOZ_ASSERT_IF(aErr, !*aErr);
+
+ unsigned char first = *p++;
+
+ if (MOZ_LIKELY(first < 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(p);
+ return first;
+ }
+
+ // Unsigned underflow is defined behavior
+ if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
+ *aBuffer = reinterpret_cast<const char*>(p);
+ if (aErr) {
+ *aErr = true;
+ }
+ return 0xFFFDU;
+ }
+
+ unsigned char second = *p;
+
+ if (first < 0xE0U) {
+ // Two-byte
+ if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(++p);
+ return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
+ }
+ *aBuffer = reinterpret_cast<const char*>(p);
+ if (aErr) {
+ *aErr = true;
+ }
+ return 0xFFFDU;
+ }
+
+ if (MOZ_LIKELY(first < 0xF0U)) {
+ // Three-byte
+ unsigned char lower = 0x80U;
+ unsigned char upper = 0xBFU;
+ if (first == 0xE0U) {
+ lower = 0xA0U;
+ } else if (first == 0xEDU) {
+ upper = 0x9FU;
+ }
+ if (MOZ_LIKELY(second >= lower && second <= upper)) {
+ if (MOZ_LIKELY(++p != end)) {
+ unsigned char third = *p;
+ if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(++p);
+ return ((uint32_t(first) & 0xFU) << 12) | ((uint32_t(second) & 0x3FU) << 6) | (uint32_t(third) & 0x3FU);
+ }
+ }
+ }
+ *aBuffer = reinterpret_cast<const char*>(p);
+ if (aErr) {
+ *aErr = true;
+ }
+ return 0xFFFDU;
+ }
+
+ // Four-byte
+ unsigned char lower = 0x80U;
+ unsigned char upper = 0xBFU;
+ if (first == 0xF0U) {
+ lower = 0x90U;
+ } else if (first == 0xF4U) {
+ upper = 0x8FU;
+ }
+ if (MOZ_LIKELY(second >= lower && second <= upper)) {
+ if (MOZ_LIKELY(++p != end)) {
+ unsigned char third = *p;
+ if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+ if (MOZ_LIKELY(++p != end)) {
+ unsigned char fourth = *p;
+ if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(++p);
+ return ((uint32_t(first) & 0x7U) << 18) | ((uint32_t(second) & 0x3FU) << 12) | ((uint32_t(third) & 0x3FU) << 6) | (uint32_t(fourth) & 0x3FU);
+ }
+ }
+ }
+ }
+ }
+ *aBuffer = reinterpret_cast<const char*>(p);
+ if (aErr) {
+ *aErr = true;
+ }
+ return 0xFFFDU;
}
};
/**
- * Extract the next UCS-4 character from the buffer and return it. The
+ * Extract the next Unicode scalar value from the buffer and return it. The
* pointer passed in is advanced to the start of the next character in the
- * buffer. If non-null, the parameters err and overlong are filled in to
- * indicate that the character was represented by an overlong sequence, or
- * that an error occurred.
+ * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
+ * the unpaired surrogate and *aErr is set to true (if aErr is not null).
+ *
+ * Precondition: *aBuffer < aEnd. *aErr is only ever set, never cleared.
*/
-
-class UTF8CharEnumerator
+class UTF16CharEnumerator
{
public:
- static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
+ static inline uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd, bool* aErr = nullptr)
{
- NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
+ MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+ MOZ_ASSERT(aEnd, "null end pointer");
- const char* p = *aBuffer;
- *aErr = false;
+ const char16_t* p = *aBuffer;
- if (p >= aEnd) {
- *aErr = true;
+ MOZ_ASSERT(p, "null buffer");
+ MOZ_ASSERT(p < aEnd, "Bogus range");
+ // No assertion on *aErr: callers may chain calls that share one flag.
- return 0;
- }
+ char16_t c = *p++;
- char c = *p++;
-
- if (UTF8traits::isASCII(c)) {
+ // Let's use encoding_rs-style code golf here.
+ // Unsigned underflow is defined behavior
+ char16_t cMinusSurrogateStart = c - 0xD800U;
+ if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
*aBuffer = p;
return c;
}
-
- uint32_t ucs4;
- uint32_t minUcs4;
- int32_t state = 0;
-
- if (!CalcState(c, ucs4, minUcs4, state)) {
- NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
- *aErr = true;
-
- return 0;
- }
-
- while (state--) {
- if (p == aEnd) {
- *aErr = true;
-
- return 0;
- }
-
- c = *p++;
-
- if (!AddByte(c, state, ucs4)) {
- *aErr = true;
-
- return 0;
+ if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
+ // High surrogate
+ if (MOZ_LIKELY(p != aEnd)) {
+ char16_t second = *p;
+ // Unsigned underflow is defined behavior
+ if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
+ *aBuffer = ++p;
+ return (uint32_t(c) << 10) + uint32_t(second) - (((0xD800U << 10) - 0x10000U) + 0xDC00U);
+ }
}
}
-
- if (ucs4 < minUcs4) {
- // Overlong sequence
- ucs4 = UCS2_REPLACEMENT_CHAR;
- } else if (ucs4 >= 0xD800 &&
- (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
- // Surrogates and code points outside the Unicode range.
- ucs4 = UCS2_REPLACEMENT_CHAR;
- }
-
+ // Unpaired surrogate
*aBuffer = p;
- return ucs4;
- }
-
-private:
- static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
- int32_t& aState)
- {
- if (UTF8traits::is2byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
- aState = 1;
- aMinUcs4 = 0x00000080;
- } else if (UTF8traits::is3byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
- aState = 2;
- aMinUcs4 = 0x00000800;
- } else if (UTF8traits::is4byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
- aState = 3;
- aMinUcs4 = 0x00010000;
- } else if (UTF8traits::is5byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
- aState = 4;
- aMinUcs4 = 0x00200000;
- } else if (UTF8traits::is6byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
- aState = 5;
- aMinUcs4 = 0x04000000;
- } else {
- return false;
- }
-
- return true;
- }
-
- static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
- {
- if (UTF8traits::isInSeq(aChar)) {
- int32_t shift = aState * 6;
- aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
- return true;
+ if (aErr) {
+ *aErr = true;
}
-
- return false;
- }
-};
-
-
-/**
- * Extract the next UCS-4 character from the buffer and return it. The
- * pointer passed in is advanced to the start of the next character in the
- * buffer. If non-null, the err parameter is filled in if an error occurs.
- *
- * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
- * the buffer will be updated to move only a single UCS-2 character.
- *
- * Any other error returns 0 and does not move the buffer position.
- */
-
-
-class UTF16CharEnumerator
-{
-public:
- static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
- bool* aErr = nullptr)
- {
- NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
-
- const char16_t* p = *aBuffer;
-
- if (p >= aEnd) {
- NS_ERROR("No input to work with");
- if (aErr) {
- *aErr = true;
- }
-
- return 0;
- }
-
- char16_t c = *p++;
-
- if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
- if (aErr) {
- *aErr = false;
- }
- *aBuffer = p;
- return c;
- } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
- if (p == aEnd) {
- // Found a high surrogate at the end of the buffer. Flag this
- // as an error and return the Unicode replacement
- // character 0xFFFD.
-
- UTF8UTILS_WARNING("Unexpected end of buffer after high surrogate");
-
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p;
- return 0xFFFD;
- }
-
- // D800- DBFF - High Surrogate
- char16_t h = c;
-
- c = *p++;
-
- if (NS_IS_LOW_SURROGATE(c)) {
- // DC00- DFFF - Low Surrogate
- // N = (H - D800) *400 + 10000 + (L - DC00)
- uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
- if (aErr) {
- *aErr = false;
- }
- *aBuffer = p;
- return ucs4;
- } else {
- // Found a high surrogate followed by something other than
- // a low surrogate. Flag this as an error and return the
- // Unicode replacement character 0xFFFD. Note that the
- // pointer to the next character points to the second 16-bit
- // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
- // only the first code unit of an illegal sequence must be
- // treated as an illegally terminated code unit sequence
- // (also Chapter 3 D91, "isolated [not paired and ill-formed]
- // UTF-16 code units in the range D800..DFFF are ill-formed").
- UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
-
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p - 1;
- return 0xFFFD;
- }
- } else { // U+DC00 - U+DFFF
- // DC00- DFFF - Low Surrogate
-
- // Found a low surrogate w/o a preceding high surrogate. Flag
- // this as an error and return the Unicode replacement
- // character 0xFFFD.
-
- UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p;
- return 0xFFFD;
- }
-
- MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
+ return 0xFFFDU;
}
};
template<typename Char, typename UnsignedT>
inline UnsignedT
RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
{
static_assert(mozilla::IsSame<Char, char>::value ||
--- a/xpcom/tests/gtest/TestUTF.cpp
+++ b/xpcom/tests/gtest/TestUTF.cpp
@@ -9,16 +9,17 @@
#include <stdio.h>
#include <stdlib.h>
#include "nsString.h"
#include "nsStringBuffer.h"
#include "nsReadableUtils.h"
#include "UTFStrings.h"
#include "nsUnicharUtils.h"
#include "mozilla/HashFunctions.h"
+#include "nsUTF8Utils.h"
#include "gtest/gtest.h"
using namespace mozilla;
namespace TestUTF {
TEST(UTF, Valid)
@@ -173,19 +174,81 @@ void NonASCII16_helper(const size_t aStr
// And finish with the trailing ASCII chars.
expected.Append(asciiCString.BeginReading() + i + 1, kTestSize - i - 1);
EXPECT_STREQ(dest.BeginReading(), expected.BeginReading());
}
}
-TEST(UTF, NonASCII16)
+TEST(UTF, UTF8CharEnumerator) // Walks a 49-byte mix of valid and malformed sequences, then truncated ones.
{
- // Test with various string sizes to catch any special casing.
- NonASCII16_helper(1);
- NonASCII16_helper(8);
- NonASCII16_helper(16);
- NonASCII16_helper(32);
- NonASCII16_helper(512);
+ const char* p = "\x61\xC0\xC2\xC2\x80\xE0\x80\x80\xE0\xA0\x80\xE1\x80\x80\xED\xBF\xBF\xED\x9F\xBF\xEE\x80\x80\xEE\x80\xFF\xF0\x90\x80\x80\xF0\x80\x80\x80\xF1\x80\x80\x80\xF4\x8F\xBF\xF4\x8F\xBF\xBF\xF4\xBF\xBF\xBF";
+ const char* end = p + 49;
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0061U); // 61: ASCII
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // C0: lead byte below C2
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // C2 followed by a non-continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0080U); // C2 80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // E0 80: second byte below A0
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // 80: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // 80: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0800U); // E0 A0 80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x1000U); // E1 80 80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // ED BF: second byte above 9F
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // BF: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // BF: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xD7FFU); // ED 9F BF
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xE000U); // EE 80 80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // EE 80 FF: third byte is not a continuation
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // FF: lead byte above F4
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10000U); // F0 90 80 80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // F0 80: second byte below 90
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // 80: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // 80: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // 80: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x40000U); // F1 80 80 80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // F4 8F BF then F4: fourth byte is not a continuation
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10FFFFU); // F4 8F BF BF
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // F4 BF: second byte above 8F
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // BF: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // BF: lone continuation byte
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // BF: lone continuation byte
+ EXPECT_EQ(p, end); // all 49 bytes consumed
+ p = "\xC2";
+ end = p + 1;
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // two-byte lead with nothing after it
+ EXPECT_EQ(p, end);
+ p = "\xE1\x80";
+ end = p + 2;
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // three-byte sequence cut off after the second byte
+ EXPECT_EQ(p, end);
+ p = "\xF1\x80\x80";
+ end = p + 3;
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // four-byte sequence cut off after the third byte
+ EXPECT_EQ(p, end);
+}
+
+TEST(UTF, UTF16CharEnumerator) // Covers a BMP unit, a surrogate pair, and three unpaired-surrogate cases.
+{
+ const char16_t* p = u"\u0061\U0001F4A9"; // three code units: 0061, D83D, DCA9
+ const char16_t* end = p + 3;
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x0061U);
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x1F4A9U); // surrogate pair D83D DCA9
+ EXPECT_EQ(p, end);
+ const char16_t loneHigh = 0xD83D;
+ p = &loneHigh;
+ end = p + 1;
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU); // high surrogate at end of buffer
+ EXPECT_EQ(p, end);
+ const char16_t loneLow = 0xDCA9;
+ p = &loneLow;
+ end = p + 1;
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU); // low surrogate with no preceding high surrogate
+ EXPECT_EQ(p, end);
+ const char16_t loneHighStr[] = { 0xD83D, 0x0061 };
+ p = loneHighStr;
+ end = p + 2;
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU); // high surrogate followed by a non-surrogate: only one unit consumed
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x0061U); // the following unit is returned on its own
+ EXPECT_EQ(p, end);
}
} // namespace TestUTF