Bug 1402247 part 10 - Make UTF8CharEnumerator::NextChar() consistent with UTF-8 to UTF-16 string conversion. draft
author Henri Sivonen <hsivonen@hsivonen.fi>
Tue, 20 Mar 2018 18:01:54 +0200
changeset 770054 38feba9d243f4c1a0c9bfcd972a01b6399f2ecad
parent 769235 ec1b44c638a8d2376baffe0e34dd796a37262260
child 770581 c0da06689514db3c98ffb158965d76b46a3458f7
child 771518 ece6c9742bb30761c697ebbe17d3f0881c4fb14d
push id 103299
push user bmo:hsivonen@hsivonen.fi
push date Tue, 20 Mar 2018 16:02:59 +0000
bugs 1402247
milestone 61.0a1
Bug 1402247 part 10 - Make UTF8CharEnumerator::NextChar() consistent with UTF-8 to UTF-16 string conversion. MozReview-Commit-ID: 7lA6EVNX3Fy
intl/lwbrk/nsPangoBreaker.cpp
xpcom/ds/nsAtomTable.cpp
xpcom/string/nsReadableUtils.cpp
xpcom/string/nsUTF8Utils.h
xpcom/tests/gtest/TestUTF.cpp
--- a/intl/lwbrk/nsPangoBreaker.cpp
+++ b/intl/lwbrk/nsPangoBreaker.cpp
@@ -37,21 +37,23 @@ NS_GetComplexLineBreaks(const char16_t* 
 
     while (p < end)
     {
       aBreakBefore[u16Offset] = attr->is_line_break;
       if (NS_IS_LOW_SURROGATE(aText[u16Offset]))
         aBreakBefore[++u16Offset] = false; // Skip high surrogate
       ++u16Offset;
 
-      bool err;
-      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end, &err);
+      // We're iterating over text obtained from NS_ConvertUTF16toUTF8,
+      // so we know we have valid UTF-8 and don't need to check for
+      // errors.
+      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);
       ++attr;
 
-      if (ch == 0 || err) {
+      if (!ch) {
         // pango_break (pango 1.16.2) only analyses text before the
         // first NUL (but sets one extra attr). Workaround loop to call
         // pango_break again to analyse after the NUL is done somewhere else
         // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
         // So, we do the same here for pango_get_log_attrs.
         break;
       }
     }
--- a/xpcom/ds/nsAtomTable.cpp
+++ b/xpcom/ds/nsAtomTable.cpp
@@ -740,17 +740,21 @@ nsAtomTable::Atomize(const nsACString& a
 
     return atom.forget();
   }
 
   // This results in an extra addref/release of the nsStringBuffer.
   // Unfortunately there doesn't seem to be any APIs to avoid that.
   // Actually, now there is, sort of: ForgetSharedBuffer.
   nsString str;
-  CopyUTF8toUTF16(aUTF8String, str);
+  // If the input was invalid UTF-8, hash is zero and we atomize
+  // the empty string.
+  if (hash) {
+    CopyUTF8toUTF16(aUTF8String, str);
+  }
   RefPtr<nsAtom> atom = dont_AddRef(new nsDynamicAtom(str, hash));
 
   he->mAtom = atom;
 
   return atom.forget();
 }
 
 already_AddRefed<nsAtom>
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -631,86 +631,52 @@ VoidCString()
 
   return sNull;
 }
 
 int32_t
 CompareUTF8toUTF16(const nsACString& aUTF8String,
                    const nsAString& aUTF16String)
 {
-  static const uint32_t NOT_ASCII = uint32_t(~0x7F);
-
   const char* u8;
   const char* u8end;
   aUTF8String.BeginReading(u8);
   aUTF8String.EndReading(u8end);
 
   const char16_t* u16;
   const char16_t* u16end;
   aUTF16String.BeginReading(u16);
   aUTF16String.EndReading(u16end);
 
-  while (u8 != u8end && u16 != u16end) {
-    // Cast away the signedness of *u8 to prevent signextension when
-    // converting to uint32_t
-    uint32_t c8_32 = (uint8_t)*u8;
-
-    if (c8_32 & NOT_ASCII) {
-      bool err;
-      c8_32 = UTF8CharEnumerator::NextChar(&u8, u8end, &err);
-      if (err) {
-        return INT32_MIN;
+  bool err = false;
+  for (;;) {
+    if (u8 == u8end) {
+      if (u16 == u16end) {
+        return 0;
       }
-
-      uint32_t c16_32 = UTF16CharEnumerator::NextChar(&u16, u16end);
-      // The above UTF16CharEnumerator::NextChar() calls can
-      // fail, but if it does for anything other than no data to
-      // look at (which can't happen here), it returns the
-      // Unicode replacement character 0xFFFD for the invalid
-      // data they were fed. Ignore that error and treat invalid
-      // UTF16 as 0xFFFD.
-      //
-      // This matches what our UTF16 to UTF8 conversion code
-      // does, and thus a UTF8 string that came from an invalid
-      // UTF16 string will compare equal to the invalid UTF16
-      // string it came from. Same is true for any other UTF16
-      // string differs only in the invalid part of the string.
-
-      if (c8_32 != c16_32) {
-        return c8_32 < c16_32 ? -1 : 1;
-      }
-    } else {
-      if (c8_32 != *u16) {
-        return c8_32 > *u16 ? 1 : -1;
-      }
-
-      ++u8;
-      ++u16;
+      return -1;
+    }
+    if (u16 == u16end) {
+      return 1;
+    }
+    // No ASCII fast path needed: both NextChar() calls get inlined.
+    // NextChar() asserts !*aErr on entry, so bail out before the
+    // UTF-16 call if the UTF-8 call has already flagged an error.
+    uint32_t scalar8 = UTF8CharEnumerator::NextChar(&u8, u8end, &err);
+    if (err) {
+      return INT32_MIN;
+    }
+    uint32_t scalar16 = UTF16CharEnumerator::NextChar(&u16, u16end, &err);
+    if (err) {
+      return INT32_MIN;
+    }
+    if (scalar8 != scalar16) {
+      return scalar8 < scalar16 ? -1 : 1;
     }
   }
-
-  if (u8 != u8end) {
-    // We get to the end of the UTF16 string, but no to the end of
-    // the UTF8 string. The UTF8 string is longer than the UTF16
-    // string
-
-    return 1;
-  }
-
-  if (u16 != u16end) {
-    // We get to the end of the UTF8 string, but no to the end of
-    // the UTF16 string. The UTF16 string is longer than the UTF8
-    // string
-
-    return -1;
-  }
-
-  // The two strings match.
-
-  return 0;
 }
 
 void
 AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest)
 {
   NS_ASSERTION(IS_VALID_CHAR(aSource), "Invalid UCS4 char");
   if (IS_IN_BMP(aSource)) {
     aDest.Append(char16_t(aSource));
--- a/xpcom/string/nsUTF8Utils.h
+++ b/xpcom/string/nsUTF8Utils.h
@@ -41,243 +41,176 @@ public:
   static bool is3byte(char aChar)
   {
     return (aChar & 0xF0) == 0xE0;
   }
   static bool is4byte(char aChar)
   {
     return (aChar & 0xF8) == 0xF0;
   }
-  static bool is5byte(char aChar)
+};
+
+/**
+ * Extract the next Unicode scalar value from the buffer and return it. The
+ * pointer passed in is advanced to the start of the next character in the
+ * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
+ * over the maximal valid prefix and *aErr is set to true (if aErr is not
+ * null). No byte at or past aEnd is ever read.
+ *
+ * Precondition: *aBuffer < aEnd; *aErr == false (if aErr is not null)
+ */
+class UTF8CharEnumerator
+{
+public:
+  static inline uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr = nullptr)
   {
-    return (aChar & 0xFC) == 0xF8;
-  }
-  static bool is6byte(char aChar)
-  {
-    return (aChar & 0xFE) == 0xFC;
+    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+    MOZ_ASSERT(aEnd, "null end pointer");
+
+    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
+    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
+
+    MOZ_ASSERT(p, "null buffer");
+    MOZ_ASSERT(p < end, "Bogus range");
+    MOZ_ASSERT_IF(aErr, !*aErr);
+
+    unsigned char first = *p++;
+
+    if (MOZ_LIKELY(first < 0x80U)) {
+      *aBuffer = reinterpret_cast<const char*>(p);
+      return first;
+    }
+
+    // Unsigned underflow is defined behavior
+    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
+      *aBuffer = reinterpret_cast<const char*>(p);
+      if (aErr) {
+        *aErr = true;
+      }
+      return 0xFFFDU;
+    }
+
+    unsigned char second = *p;
+
+    if (first < 0xE0U) {
+      // Two-byte
+      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
+        *aBuffer = reinterpret_cast<const char*>(++p);
+        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
+      }
+      *aBuffer = reinterpret_cast<const char*>(p);
+      if (aErr) {
+        *aErr = true;
+      }
+      return 0xFFFDU;
+    }
+
+    if (MOZ_LIKELY(first < 0xF0U)) {
+      // Three-byte
+      unsigned char lower = 0x80U;
+      unsigned char upper = 0xBFU;
+      if (first == 0xE0U) {
+        lower = 0xA0U;
+      } else if (first == 0xEDU) {
+        upper = 0x9FU;
+      }
+      if (MOZ_LIKELY(second >= lower && second <= upper)) {
+        if (MOZ_LIKELY(++p != end)) {
+          unsigned char third = *p;
+          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+            *aBuffer = reinterpret_cast<const char*>(++p);
+            return ((uint32_t(first) & 0xFU) << 12) | ((uint32_t(second) & 0x3FU) << 6) | (uint32_t(third) & 0x3FU);
+          }
+        }
+      }
+      *aBuffer = reinterpret_cast<const char*>(p);
+      if (aErr) {
+        *aErr = true;
+      }
+      return 0xFFFDU;
+    }
+
+    // Four-byte
+    unsigned char lower = 0x80U;
+    unsigned char upper = 0xBFU;
+    if (first == 0xF0U) {
+      lower = 0x90U;
+    } else if (first == 0xF4U) {
+      upper = 0x8FU;
+    }
+    if (MOZ_LIKELY(second >= lower && second <= upper)) {
+      if (MOZ_LIKELY(++p != end)) {
+        unsigned char third = *p;
+        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+          if (MOZ_LIKELY(++p != end)) {
+            unsigned char fourth = *p;
+            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
+              *aBuffer = reinterpret_cast<const char*>(++p);
+              return ((uint32_t(first) & 0x7U) << 18) | ((uint32_t(second) & 0x3FU) << 12) | ((uint32_t(third) & 0x3FU) << 6) | (uint32_t(fourth) & 0x3FU);
+            }
+          }
+        }
+      }
+    }
+    *aBuffer = reinterpret_cast<const char*>(p);
+    if (aErr) {
+      *aErr = true;
+    }
+    return 0xFFFDU;
   }
 };
 
 /**
- * Extract the next UCS-4 character from the buffer and return it.  The
+ * Extract the next Unicode scalar value from the buffer and return it. The
  * pointer passed in is advanced to the start of the next character in the
- * buffer.  If non-null, the parameters err and overlong are filled in to
- * indicate that the character was represented by an overlong sequence, or
- * that an error occurred.
+ * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
+ * the unpaired surrogate and *aErr is set to true (if aErr is not null).
+ *
+ * Precondition: *aBuffer < aEnd; *aErr == false (if aErr is not null)
  */
-
-class UTF8CharEnumerator
+class UTF16CharEnumerator
 {
 public:
-  static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
+  static inline uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd, bool* aErr = nullptr)
   {
-    NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
+    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+    MOZ_ASSERT(aEnd, "null end pointer");
 
-    const char* p = *aBuffer;
-    *aErr = false;
+    const char16_t* p = *aBuffer;
 
-    if (p >= aEnd) {
-      *aErr = true;
+    MOZ_ASSERT(p, "null buffer");
+    MOZ_ASSERT(p < aEnd, "Bogus range");
+    MOZ_ASSERT_IF(aErr, !*aErr);
 
-      return 0;
-    }
+    char16_t c = *p++;
 
-    char c = *p++;
-
-    if (UTF8traits::isASCII(c)) {
+    // Let's use encoding_rs-style code golf here.
+    // Unsigned underflow is defined behavior
+    char16_t cMinusSurrogateStart = c - 0xD800U;
+    if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
       *aBuffer = p;
       return c;
     }
-
-    uint32_t ucs4;
-    uint32_t minUcs4;
-    int32_t state = 0;
-
-    if (!CalcState(c, ucs4, minUcs4, state)) {
-      NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
-      *aErr = true;
-
-      return 0;
-    }
-
-    while (state--) {
-      if (p == aEnd) {
-        *aErr = true;
-
-        return 0;
-      }
-
-      c = *p++;
-
-      if (!AddByte(c, state, ucs4)) {
-        *aErr = true;
-
-        return 0;
+    if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
+      // High surrogate
+      if (MOZ_LIKELY(p != aEnd)) {
+        char16_t second = *p;
+        // Unsigned underflow is defined behavior
+        if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
+          *aBuffer = ++p;
+          return (uint32_t(c) << 10) + uint32_t(second) - (((0xD800U << 10) - 0x10000U) + 0xDC00U);
+        }
       }
     }
-
-    if (ucs4 < minUcs4) {
-      // Overlong sequence
-      ucs4 = UCS2_REPLACEMENT_CHAR;
-    } else if (ucs4 >= 0xD800 &&
-               (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
-      // Surrogates and code points outside the Unicode range.
-      ucs4 = UCS2_REPLACEMENT_CHAR;
-    }
-
+    // Unpaired surrogate
     *aBuffer = p;
-    return ucs4;
-  }
-
-private:
-  static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
-                        int32_t& aState)
-  {
-    if (UTF8traits::is2byte(aChar)) {
-      aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
-      aState = 1;
-      aMinUcs4 = 0x00000080;
-    } else if (UTF8traits::is3byte(aChar)) {
-      aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
-      aState = 2;
-      aMinUcs4 = 0x00000800;
-    } else if (UTF8traits::is4byte(aChar)) {
-      aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
-      aState = 3;
-      aMinUcs4 = 0x00010000;
-    } else if (UTF8traits::is5byte(aChar)) {
-      aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
-      aState = 4;
-      aMinUcs4 = 0x00200000;
-    } else if (UTF8traits::is6byte(aChar)) {
-      aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
-      aState = 5;
-      aMinUcs4 = 0x04000000;
-    } else {
-      return false;
-    }
-
-    return true;
-  }
-
-  static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
-  {
-    if (UTF8traits::isInSeq(aChar)) {
-      int32_t shift = aState * 6;
-      aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
-      return true;
+    if (aErr) {
+      *aErr = true;
     }
-
-    return false;
-  }
-};
-
-
-/**
- * Extract the next UCS-4 character from the buffer and return it.  The
- * pointer passed in is advanced to the start of the next character in the
- * buffer.  If non-null, the err parameter is filled in if an error occurs.
- *
- * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
- * the buffer will be updated to move only a single UCS-2 character.
- *
- * Any other error returns 0 and does not move the buffer position.
- */
-
-
-class UTF16CharEnumerator
-{
-public:
-  static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
-                           bool* aErr = nullptr)
-  {
-    NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
-
-    const char16_t* p = *aBuffer;
-
-    if (p >= aEnd) {
-      NS_ERROR("No input to work with");
-      if (aErr) {
-        *aErr = true;
-      }
-
-      return 0;
-    }
-
-    char16_t c = *p++;
-
-    if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
-      if (aErr) {
-        *aErr = false;
-      }
-      *aBuffer = p;
-      return c;
-    } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
-      if (p == aEnd) {
-        // Found a high surrogate at the end of the buffer. Flag this
-        // as an error and return the Unicode replacement
-        // character 0xFFFD.
-
-        UTF8UTILS_WARNING("Unexpected end of buffer after high surrogate");
-
-        if (aErr) {
-          *aErr = true;
-        }
-        *aBuffer = p;
-        return 0xFFFD;
-      }
-
-      // D800- DBFF - High Surrogate
-      char16_t h = c;
-
-      c = *p++;
-
-      if (NS_IS_LOW_SURROGATE(c)) {
-        // DC00- DFFF - Low Surrogate
-        // N = (H - D800) *400 + 10000 + (L - DC00)
-        uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
-        if (aErr) {
-          *aErr = false;
-        }
-        *aBuffer = p;
-        return ucs4;
-      } else {
-        // Found a high surrogate followed by something other than
-        // a low surrogate. Flag this as an error and return the
-        // Unicode replacement character 0xFFFD.  Note that the
-        // pointer to the next character points to the second 16-bit
-        // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
-        // only the first code unit of an illegal sequence must be
-        // treated as an illegally terminated code unit sequence
-        // (also Chapter 3 D91, "isolated [not paired and ill-formed]
-        // UTF-16 code units in the range D800..DFFF are ill-formed").
-        UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
-
-        if (aErr) {
-          *aErr = true;
-        }
-        *aBuffer = p - 1;
-        return 0xFFFD;
-      }
-    } else { // U+DC00 - U+DFFF
-      // DC00- DFFF - Low Surrogate
-
-      // Found a low surrogate w/o a preceding high surrogate. Flag
-      // this as an error and return the Unicode replacement
-      // character 0xFFFD.
-
-      UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
-      if (aErr) {
-        *aErr = true;
-      }
-      *aBuffer = p;
-      return 0xFFFD;
-    }
-
-    MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
+    return 0xFFFDU;
   }
 };
 
 template<typename Char, typename UnsignedT>
 inline UnsignedT
 RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
 {
   static_assert(mozilla::IsSame<Char, char>::value ||
--- a/xpcom/tests/gtest/TestUTF.cpp
+++ b/xpcom/tests/gtest/TestUTF.cpp
@@ -9,16 +9,17 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "nsString.h"
 #include "nsStringBuffer.h"
 #include "nsReadableUtils.h"
 #include "UTFStrings.h"
 #include "nsUnicharUtils.h"
 #include "mozilla/HashFunctions.h"
+#include "nsUTF8Utils.h"
 
 #include "gtest/gtest.h"
 
 using namespace mozilla;
 
 namespace TestUTF {
 
 TEST(UTF, Valid)
@@ -173,19 +174,81 @@ void NonASCII16_helper(const size_t aStr
 
     // And finish with the trailing ASCII chars.
     expected.Append(asciiCString.BeginReading() + i + 1, kTestSize - i - 1);
 
     EXPECT_STREQ(dest.BeginReading(), expected.BeginReading());
   }
 }
 
-TEST(UTF, NonASCII16)
+TEST(UTF, UTF8CharEnumerator)
 {
-  // Test with various string sizes to catch any special casing.
-  NonASCII16_helper(1);
-  NonASCII16_helper(8);
-  NonASCII16_helper(16);
-  NonASCII16_helper(32);
-  NonASCII16_helper(512);
+  const char* p = "\x61\xC0\xC2\xC2\x80\xE0\x80\x80\xE0\xA0\x80\xE1\x80\x80\xED\xBF\xBF\xED\x9F\xBF\xEE\x80\x80\xEE\x80\xFF\xF0\x90\x80\x80\xF0\x80\x80\x80\xF1\x80\x80\x80\xF4\x8F\xBF\xF4\x8F\xBF\xBF\xF4\xBF\xBF\xBF";
+  const char* end = p + 49;
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0061U);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0080U);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0800U);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x1000U);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xD7FFU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xE000U);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10000U);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x40000U);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10FFFFU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(p, end);
+  p = "\xC2";
+  end = p + 1;
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(p, end);
+  p = "\xE1\x80";
+  end = p + 2;
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(p, end);
+  p = "\xF1\x80\x80";
+  end = p + 3;
+  EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(p, end);
+}
+
+TEST(UTF, UTF16CharEnumerator)
+{
+  const char16_t* p = u"\u0061\U0001F4A9";
+  const char16_t* end = p + 3;
+  EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x0061U);
+  EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x1F4A9U);
+  EXPECT_EQ(p, end);
+  const char16_t loneHigh = 0xD83D;
+  p = &loneHigh;
+  end = p + 1;
+  EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(p, end);
+  const char16_t loneLow = 0xDCA9;
+  p = &loneLow;
+  end = p + 1;
+  EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(p, end);
+  const char16_t loneHighStr[] = { 0xD83D, 0x0061 };
+  p = loneHighStr;
+  end = p + 2;
+  EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU);
+  EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x0061U);
+  EXPECT_EQ(p, end);
 }
 
 } // namespace TestUTF