author Henri Sivonen <hsivonen@hsivonen.fi>

Wed, 14 Mar 2018 19:22:21 +0200

changeset 768033 b8b72e2d353ba24a5efbffdab728463533b54ed3

parent 768032 162bcfce6cade5b58f29f0a289bc44355a5442fe

child 768034 c766164e384ccc9251b8efeb56a50cbe4b3e985c

push id 102784

push user bmo:hsivonen@hsivonen.fi

push date Thu, 15 Mar 2018 16:05:17 +0000

bugs 1402247

milestone 61.0a1

intl/encoding_glue/src/lib.rs file | annotate | diff | comparison | revisions

js/xpconnect/src/XPCConvert.cpp file | annotate | diff | comparison | revisions

xpcom/string/nsReadableUtils.cpp file | annotate | diff | comparison | revisions

xpcom/string/nsReadableUtils.h file | annotate | diff | comparison | revisions

xpcom/string/nsUTF8Utils.h file | annotate | diff | comparison | revisions
--- a/intl/encoding_glue/src/lib.rs
+++ b/intl/encoding_glue/src/lib.rs
@@ -597,21 +597,41 @@ pub unsafe extern "C" fn encoding_mem_is
 }
 
 #[no_mangle]
 pub unsafe extern "C" fn encoding_mem_is_basic_latin(buffer: *const u16, len: usize) -> bool {
     encoding_rs::mem::is_basic_latin(::std::slice::from_raw_parts(buffer, len))
 }
 
 #[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf8_latin1(buffer: *const u8, len: usize) -> bool {
+    encoding_rs::mem::is_utf8_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf16_latin1(buffer: *const u16, len: usize) -> bool {
+    encoding_rs::mem::is_utf16_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_str_latin1(buffer: *const u8, len: usize) -> bool {
+    encoding_rs::mem::is_str_latin1(::std::str::from_utf8_unchecked(::std::slice::from_raw_parts(buffer, len)))
+}
+
+#[no_mangle]
 pub unsafe extern "C" fn encoding_mem_convert_utf16_to_latin1_lossy(src: *const u16, src_len: usize, dst: *mut u8, dst_len: usize) {
     encoding_rs::mem::convert_utf16_to_latin1_lossy(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len));
 }
 
 #[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_latin1_lossy(src: *const u8, src_len: usize, dst: *mut u8, dst_len: usize) -> usize {
+    encoding_rs::mem::convert_utf8_to_latin1_lossy(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len))
+}
+
+#[no_mangle]
 pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf16(src: *const u8, src_len: usize, dst: *mut u16, dst_len: usize) {
     encoding_rs::mem::convert_latin1_to_utf16(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len));
 }
 
 #[no_mangle]
 pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8(src: *const u16, src_len: usize, dst: *mut u8, dst_len: usize) -> usize {
     encoding_rs::mem::convert_utf16_to_utf8(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len))
 }
--- a/js/xpconnect/src/XPCConvert.cpp
+++ b/js/xpconnect/src/XPCConvert.cpp
@@ -258,51 +258,77 @@ XPCConvert::NativeData2JS(MutableHandleV
     {
         const nsACString* utf8String = *static_cast<const nsACString* const*>(s);
 
         if (!utf8String || utf8String->IsVoid()) {
             d.setNull();
             return true;
         }
 
-        if (utf8String->IsEmpty()) {
+        uint32_t len = utf8String->Length();
+
+        if (!len) {
             d.set(JS_GetEmptyStringValue(cx));
             return true;
         }
 
-        const uint32_t len = CalcUTF8ToUnicodeLength(*utf8String);
-        // The cString is not empty at this point, but the calculated
-        // UTF-16 length is zero, meaning no valid conversion exists.
-        if (!len)
-            return false;
-
-        const size_t buffer_size = (len + 1) * sizeof(char16_t);
-        char16_t* buffer =
-            static_cast<char16_t*>(JS_malloc(cx, buffer_size));
-        if (!buffer)
-            return false;
-
-        uint32_t copied;
-        if (!UTF8ToUnicodeBuffer(*utf8String, buffer, &copied) ||
-            len != copied) {
-            // Copy or conversion during copy failed. Did not copy the
-            // whole string.
-            JS_free(cx, buffer);
+        CheckedInt<uint32_t> allocLen(len);
+        allocLen += 1;
+        if (!allocLen.isValid()) {
             return false;
         }
 
-        // JS_NewUCString takes ownership on success, i.e. a
+        // Usage of UTF-8 in XPConnect is mostly for things that are
+        // almost always ASCII, so the inexact allocations below
+        // should be fine.
+
+        if (IsUTF8Latin1(*utf8String)) {
+            char* buffer =
+                static_cast<char*>(JS_malloc(cx, allocLen.value()));
+            if (!buffer) {
+                return false;
+            }
+            size_t written = LossyConvertUTF8toLatin1(*utf8String, MakeSpan(buffer, len));
+            buffer[written] = 0;
+            // The buffer holds allocLen (len + 1) bytes so the terminator fits
+            // even when written == len. JS_NewLatin1String takes ownership on
+            // success, i.e. the JS VM becomes responsible for freeing the
+            // buffer. written can never exceed len, so the truncation is OK.
+            JSString* str = JS_NewLatin1String(cx, reinterpret_cast<JS::Latin1Char*>(buffer), written);
+            if (!str) {
+                JS_free(cx, buffer);
+                return false;
+            }
+            d.setString(str);
+            return true;
+        }
+
+        allocLen *= sizeof(char16_t);
+        if (!allocLen.isValid()) {
+            return false;
+        }
+
+        // ConvertUTF8toUTF16Func requires the destination to be one code unit
+        // longer than the source; allocLen is now that size in bytes.
+        char16_t* buffer =
+            static_cast<char16_t*>(JS_malloc(cx, allocLen.value()));
+        if (!buffer) {
+            return false;
+        }
+        size_t written = ConvertUTF8toUTF16Func(*utf8String, MakeSpan(buffer, len + 1));
+        buffer[written] = 0;
+        // JS_NewUCStringDontDeflate takes ownership on success, i.e. a
         // successful call will make it the responsiblity of the JS VM
         // to free the buffer.
-        JSString* str = JS_NewUCString(cx, buffer, len);
+        // written can never exceed len + 1, so the truncation is OK.
+        JSString* str = JS_NewUCStringDontDeflate(cx, buffer, written);
         if (!str) {
             JS_free(cx, buffer);
             return false;
         }
-
         d.setString(str);
         return true;
     }
     case nsXPTType::T_CSTRING:
     {
         const nsACString* cString = *static_cast<const nsACString* const*>(s);
 
         if (!cString || cString->IsVoid()) {
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -125,41 +125,16 @@ ToNewUnicode(const nsACString& aSource)
   }
 
   auto len = aSource.Length();
   ConvertLatin1toUTF16(aSource, MakeSpan(dest, len));
   dest[len] = 0;
   return dest;
 }
 
-uint32_t
-CalcUTF8ToUnicodeLength(const nsACString& aSource)
-{
-  nsACString::const_iterator start, end;
-  CalculateUTF8Length calculator;
-  copy_string(aSource.BeginReading(start), aSource.EndReading(end),
-              calculator);
-  return calculator.Length();
-}
-
-char16_t*
-UTF8ToUnicodeBuffer(const nsACString& aSource, char16_t* aBuffer,
-                    uint32_t* aUTF16Count)
-{
-  nsACString::const_iterator start, end;
-  ConvertUTF8toUTF16 converter(aBuffer);
-  copy_string(aSource.BeginReading(start),
-              aSource.EndReading(end),
-              converter).write_terminator();
-  if (aUTF16Count) {
-    *aUTF16Count = converter.Length();
-  }
-  return aBuffer;
-}
-
 char16_t*
 UTF8ToNewUnicode(const nsACString& aSource, uint32_t* aUTF16Count)
 {
   // Compute length plus one as required by ConvertUTF8toUTF16Func
   uint32_t lengthPlusOne = aSource.Length() + 1; // Can't overflow
 
   mozilla::CheckedInt<size_t> allocLength(lengthPlusOne);
   // Add space for zero-termination
--- a/xpcom/string/nsReadableUtils.h
+++ b/xpcom/string/nsReadableUtils.h
@@ -18,17 +18,21 @@
 
 #include "nsTArrayForwardDeclare.h"
 
 // Can't include mozilla/Encoding.h here
 extern "C" {
   size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
   bool encoding_mem_is_ascii(char const* buffer, size_t buffer_len);
   bool encoding_mem_is_basic_latin(char16_t const* buffer, size_t buffer_len);
+  bool encoding_mem_is_utf8_latin1(char const* buffer, size_t buffer_len);
+  bool encoding_mem_is_str_latin1(char const* buffer, size_t buffer_len);
+  bool encoding_mem_is_utf16_latin1(char16_t const* buffer, size_t buffer_len);
   void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, size_t src_len, char* dst, size_t dst_len);
+  size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src, size_t src_len, char* dst, size_t dst_len);
   void encoding_mem_convert_latin1_to_utf16(const char* src, size_t src_len, char16_t* dst, size_t dst_len);
   size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len);
   size_t encoding_mem_convert_utf8_to_utf16(const char* src, size_t src_len, char16_t* dst, size_t dst_len);
 }
 
 // From the nsstring crate
 extern "C" {
   bool nsstring_fallible_append_utf8_impl(nsAString* aThis, const char* aOther, size_t aOtherLen, size_t aOldLen);
@@ -48,16 +52,29 @@ extern "C" {
  */
 inline void
 LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource, mozilla::Span<char> aDest)
 {
   encoding_mem_convert_utf16_to_latin1_lossy(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
 }
 
 /**
+ * If all the code points in the input are below U+0100, converts to Latin1, i.e. unsigned byte value is Unicode
+ * scalar value; not windows-1252. If there are code points above U+00FF, asserts in debug builds and produces
+ * garbage in release builds. The nature of the garbage may depend on the CPU architecture and must not be relied upon.
+ *
+ * The length of aDest must not be less than the length of aSource.
+ */
+inline size_t
+LossyConvertUTF8toLatin1(mozilla::Span<const char> aSource, mozilla::Span<char> aDest)
+{
+  return encoding_mem_convert_utf8_to_latin1_lossy(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
  * Interprets unsigned byte value as Unicode scalar value (i.e. not windows-1251!).
  *
  * The length of aDest must be not be less than the length of aSource.
  */
 inline void
 ConvertLatin1toUTF16(mozilla::Span<const char> aSource, mozilla::Span<char16_t> aDest)
 {
   encoding_mem_convert_latin1_to_utf16(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
@@ -250,17 +267,16 @@ inline void LossyAppendUTF16toASCII(mozi
  * This conversion is not well defined; but it reproduces legacy string behavior.
  * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
  *
  * @param aSource a 16-bit wide string
  * @return a new |char| buffer you must free with |free|.
  */
 char* ToNewCString(const nsAString& aSource);
 
-
 /**
  * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
  *
  * Allocates and returns a new |char| buffer which you must free with |free|.
  * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
  *
  * @param aSource an 8-bit wide string
  * @return a new |char| buffer you must free with |free|.
@@ -309,48 +325,16 @@ char16_t* ToNewUnicode(const nsAString& 
  * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
  *
  * @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
  * @return a new |char16_t| buffer you must free with |free|.
  */
 char16_t* ToNewUnicode(const nsACString& aSource);
 
 /**
- * Returns the required length for a char16_t buffer holding
- * a copy of aSource, using UTF-8 to UTF-16 conversion.
- * The length does NOT include any space for zero-termination.
- *
- * @param aSource an 8-bit wide string, UTF-8 encoded
- * @return length of UTF-16 encoded string copy, not zero-terminated
- */
-uint32_t CalcUTF8ToUnicodeLength(const nsACString& aSource);
-
-/**
- * Copies the source string into the specified buffer, converting UTF-8 to
- * UTF-16 in the process. The conversion is well defined for valid UTF-8
- * strings.
- * The copied string will be zero-terminated! Any embedded nulls will be
- * copied nonetheless. It is the caller's responsiblity to ensure the buffer
- * is large enough to hold the string copy plus one char16_t for
- * zero-termination!
- *
- * @see CalcUTF8ToUnicodeLength( const nsACString& )
- * @see UTF8ToNewUnicode( const nsACString&, uint32_t* )
- *
- * @param aSource an 8-bit wide string, UTF-8 encoded
- * @param aBuffer the buffer holding the converted string copy
- * @param aUTF16Count receiving optionally the number of 16-bit units that
- *                    were copied
- * @return aBuffer pointer, for convenience
- */
-char16_t* UTF8ToUnicodeBuffer(const nsACString& aSource,
-                              char16_t* aBuffer,
-                              uint32_t* aUTF16Count = nullptr);
-
-/**
  * Returns a new |char16_t| buffer containing a zero-terminated copy
  * of |aSource|.
  *
  * Allocates and returns a new |char| buffer which you must free with
  * |free|.  Performs an encoding conversion from UTF-8 to UTF-16
  * while copying |aSource| to your new buffer.  Malformed byte sequences
  * are replaced with the REPLACEMENT CHARACTER.  The new buffer is
  * zero-terminated, but that may not help you if |aSource| contains
@@ -427,16 +411,50 @@ inline bool IsASCII(mozilla::Span<const 
  * @param aString a 8-bit wide string to scan
  */
 inline bool IsASCII(mozilla::Span<const char> aString)
 {
   return encoding_mem_is_ascii(aString.Elements(), aString.Length());
 }
 
 /**
+ * Returns |true| if |aString| contains only Latin1 characters, that is, characters in the range U+0000 to U+00FF, inclusive.
+ *
+ * @param aString a potentially-invalid UTF-16 string to scan
+ */
+inline bool IsUTF16Latin1(mozilla::Span<const char16_t> aString)
+{
+  return encoding_mem_is_utf16_latin1(aString.Elements(), aString.Length());
+}
+
+/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is, characters in the range U+0000 to U+00FF, inclusive.
+ *
+ * If you know that the argument is always absolutely guaranteed to be valid UTF-8, use the faster UnsafeIsValidUTF8Latin1() instead.
+ *
+ * @param aString potentially-invalid UTF-8 string to scan
+ */
+inline bool IsUTF8Latin1(mozilla::Span<const char> aString)
+{
+  return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length());
+}
+
+/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is, characters in the range U+0000 to U+00FF, inclusive.
+ *
+ * The argument MUST be valid UTF-8. If you are at all unsure, use IsUTF8Latin1 instead!
+ *
+ * @param aString known-valid UTF-8 string to scan
+ */
+inline bool UnsafeIsValidUTF8Latin1(mozilla::Span<const char> aString)
+{
+  return encoding_mem_is_str_latin1(aString.Elements(), aString.Length());
+}
+
+/**
  * Returns |true| if |aString| is a valid UTF-8 string.
  *
  * Note that this doesn't check whether the string might look like a valid
  * string in another encoding, too, e.g. ISO-2022-JP.
  *
  * @param aString an 8-bit wide string to scan
  */
 inline bool IsUTF8(const nsACString& aString)
--- a/xpcom/string/nsUTF8Utils.h
+++ b/xpcom/string/nsUTF8Utils.h
@@ -273,376 +273,16 @@ public:
       *aBuffer = p;
       return 0xFFFD;
     }
 
     MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
   }
 };
 
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for converting
- * UTF-8 to UTF-16
- */
-class ConvertUTF8toUTF16
-{
-public:
-  typedef char value_type;
-  typedef char16_t buffer_type;
-
-  explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
-    : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
-  {
-  }
-
-  size_t Length() const
-  {
-    return mBuffer - mStart;
-  }
-
-  bool ErrorEncountered() const
-  {
-    return mErrorEncountered;
-  }
-
-  void write(const value_type* aStart, uint32_t aN)
-  {
-    if (mErrorEncountered) {
-      return;
-    }
-
-    // algorithm assumes utf8 units won't
-    // be spread across fragments
-    const value_type* p = aStart;
-    const value_type* end = aStart + aN;
-    buffer_type* out = mBuffer;
-    for (; p != end /* && *p */;) {
-      bool err;
-      uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
-
-      if (err) {
-        mErrorEncountered = true;
-        mBuffer = out;
-        return;
-      }
-
-      if (ucs4 >= PLANE1_BASE) {
-        *out++ = (buffer_type)H_SURROGATE(ucs4);
-        *out++ = (buffer_type)L_SURROGATE(ucs4);
-      } else {
-        *out++ = ucs4;
-      }
-    }
-    mBuffer = out;
-  }
-
-  void write_terminator()
-  {
-    *mBuffer = buffer_type(0);
-  }
-
-private:
-  buffer_type* const mStart;
-  buffer_type* mBuffer;
-  bool mErrorEncountered;
-};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the length of the UTF-16 string equivalent to a UTF-8 string.
- */
-class CalculateUTF8Length
-{
-public:
-  typedef char value_type;
-
-  CalculateUTF8Length()
-    : mLength(0), mErrorEncountered(false)
-  {
-  }
-
-  size_t Length() const
-  {
-    return mLength;
-  }
-
-  void write(const value_type* aStart, uint32_t aN)
-  {
-    // ignore any further requests
-    if (mErrorEncountered) {
-      return;
-    }
-
-    // algorithm assumes utf8 units won't
-    // be spread across fragments
-    const value_type* p = aStart;
-    const value_type* end = aStart + aN;
-    for (; p < end /* && *p */; ++mLength) {
-      if (UTF8traits::isASCII(*p)) {
-        p += 1;
-      } else if (UTF8traits::is2byte(*p)) {
-        p += 2;
-      } else if (UTF8traits::is3byte(*p)) {
-        p += 3;
-      } else if (UTF8traits::is4byte(*p)) {
-        // Because a UTF-8 sequence of 4 bytes represents a codepoint
-        // greater than 0xFFFF, it will become a surrogate pair in the
-        // UTF-16 string, so add 1 more to mLength.
-        // This doesn't happen with is5byte and is6byte because they
-        // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
-        // converted to a single replacement character.
-
-        // However, there is one case when a 4 byte UTF-8 sequence will
-        // only generate 2 UTF-16 bytes. If we have a properly encoded
-        // sequence, but with an invalid value (too small or too big),
-        // that will result in a replacement character being written
-        // This replacement character is encoded as just 1 single
-        // UTF-16 character, which is 2 bytes.
-
-        // The below code therefore only adds 1 to mLength if the UTF8
-        // data will produce a decoded character which is greater than
-        // or equal to 0x010000 and less than 0x0110000.
-
-        // A 4byte UTF8 character is encoded as
-        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-        // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
-        // map to bit 17-21 in the final result. If these bits are
-        // between 0x01 and 0x11, that means that the final result is
-        // between 0x010000 and 0x110000. The below code reads these
-        // bits out and assigns them to c, but shifted up 4 bits to
-        // avoid having to shift twice.
-
-        // It doesn't matter what to do in the case where p + 4 > end
-        // since no UTF16 characters will be written in that case by
-        // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
-        // any of the surrogate bits are wrong since no UTF16
-        // characters will be written in that case either.
-
-        if (p + 4 <= end) {
-          uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
-                       ((uint32_t)(p[1] & 0x30));
-          if (c >= 0x010 && c < 0x110) {
-            ++mLength;
-          }
-        }
-
-        p += 4;
-      } else if (UTF8traits::is5byte(*p)) {
-        p += 5;
-      } else if (UTF8traits::is6byte(*p)) {
-        p += 6;
-      } else { // error
-        ++mLength; // to account for the decrement below
-        break;
-      }
-    }
-    if (p != end) {
-      NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
-      --mLength; // The last multi-byte char wasn't complete, discard it.
-      mErrorEncountered = true;
-    }
-  }
-
-private:
-  size_t mLength;
-  bool mErrorEncountered;
-};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for
- * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
- * (0xEFBFBD in UTF-8).
- */
-class ConvertUTF16toUTF8
-{
-public:
-  typedef char16_t value_type;
-  typedef char buffer_type;
-
-  // The error handling here is more lenient than that in
-  // |ConvertUTF8toUTF16|, but it's that way for backwards
-  // compatibility.
-
-  explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
-    : mStart(aBuffer), mBuffer(aBuffer)
-  {
-  }
-
-  size_t Size() const
-  {
-    return mBuffer - mStart;
-  }
-
-  void write(const value_type* aStart, uint32_t aN)
-  {
-    buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
-
-    for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
-      value_type c = *p;
-      if (!(c & 0xFF80)) { // U+0000 - U+007F
-        *out++ = (char)c;
-      } else if (!(c & 0xF800)) { // U+0100 - U+07FF
-        *out++ = 0xC0 | (char)(c >> 6);
-        *out++ = 0x80 | (char)(0x003F & c);
-      } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
-        *out++ = 0xE0 | (char)(c >> 12);
-        *out++ = 0x80 | (char)(0x003F & (c >> 6));
-        *out++ = 0x80 | (char)(0x003F & c);
-      } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
-        // D800- DBFF - High Surrogate
-        value_type h = c;
-
-        ++p;
-        if (p == end) {
-          // Treat broken characters as the Unicode
-          // replacement character 0xFFFD (0xEFBFBD in
-          // UTF-8)
-          *out++ = '\xEF';
-          *out++ = '\xBF';
-          *out++ = '\xBD';
-
-          UTF8UTILS_WARNING("String ending in half a surrogate pair!");
-
-          break;
-        }
-        c = *p;
-
-        if (NS_IS_LOW_SURROGATE(c)) {
-          // DC00- DFFF - Low Surrogate
-          // N = (H - D800) *400 + 10000 + ( L - DC00 )
-          uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
-
-          // 0001 0000-001F FFFF
-          *out++ = 0xF0 | (char)(ucs4 >> 18);
-          *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
-          *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
-          *out++ = 0x80 | (char)(0x003F & ucs4);
-        } else {
-          // Treat broken characters as the Unicode
-          // replacement character 0xFFFD (0xEFBFBD in
-          // UTF-8)
-          *out++ = '\xEF';
-          *out++ = '\xBF';
-          *out++ = '\xBD';
-
-          // The pointer to the next character points to the second
-          // 16-bit value, not beyond it, as per Unicode 5.0.0
-          // Chapter 3 C10, only the first code unit of an illegal
-          // sequence must be treated as an illegally terminated
-          // code unit sequence (also Chapter 3 D91, "isolated [not
-          // paired and ill-formed] UTF-16 code units in the range
-          // D800..DFFF are ill-formed").
-          p--;
-
-          UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
-        }
-      } else { // U+DC00 - U+DFFF
-        // Treat broken characters as the Unicode replacement
-        // character 0xFFFD (0xEFBFBD in UTF-8)
-        *out++ = '\xEF';
-        *out++ = '\xBF';
-        *out++ = '\xBD';
-
-        // DC00- DFFF - Low Surrogate
-        UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
-      }
-    }
-
-    mBuffer = out;
-  }
-
-  void write_terminator()
-  {
-    *mBuffer = buffer_type(0);
-  }
-
-private:
-  buffer_type* const mStart;
-  buffer_type* mBuffer;
-};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
- * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
- */
-class CalculateUTF8Size
-{
-public:
-  typedef char16_t value_type;
-
-  CalculateUTF8Size()
-    : mSize(0)
-  {
-  }
-
-  size_t Size() const
-  {
-    return mSize;
-  }
-
-  void write(const value_type* aStart, uint32_t aN)
-  {
-    // Assume UCS2 surrogate pairs won't be spread across fragments.
-    for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
-      value_type c = *p;
-      if (!(c & 0xFF80)) { // U+0000 - U+007F
-        mSize += 1;
-      } else if (!(c & 0xF800)) { // U+0100 - U+07FF
-        mSize += 2;
-      } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
-        mSize += 3;
-      } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
-        ++p;
-        if (p == end) {
-          // Treat broken characters as the Unicode
-          // replacement character 0xFFFD (0xEFBFBD in
-          // UTF-8)
-          mSize += 3;
-
-          UTF8UTILS_WARNING("String ending in half a surrogate pair!");
-
-          break;
-        }
-        c = *p;
-
-        if (0xDC00 == (0xFC00 & c)) {
-          mSize += 4;
-        } else {
-          // Treat broken characters as the Unicode
-          // replacement character 0xFFFD (0xEFBFBD in
-          // UTF-8)
-          mSize += 3;
-
-          // The next code unit is the second 16-bit value, not
-          // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
-          // only the first code unit of an illegal sequence must
-          // be treated as an illegally terminated code unit
-          // sequence (also Chapter 3 D91, "isolated [not paired and
-          // ill-formed] UTF-16 code units in the range D800..DFFF
-          // are ill-formed").
-          p--;
-
-          UTF8UTILS_WARNING("got a high Surrogate but no low surrogate");
-        }
-      } else { // U+DC00 - U+DFFF
-        // Treat broken characters as the Unicode replacement
-        // character 0xFFFD (0xEFBFBD in UTF-8)
-        mSize += 3;
-
-        UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
-      }
-    }
-  }
-
-private:
-  size_t mSize;
-};
-
 #ifdef MOZILLA_INTERNAL_API
 /**
  * A character sink that performs a |reinterpret_cast|-style conversion
  * from char to char16_t.
  */
 class LossyConvertEncoding8to16
 {
 public:
author	Henri Sivonen <hsivonen@hsivonen.fi>
	Wed, 14 Mar 2018 19:22:21 +0200
changeset 768033	b8b72e2d353ba24a5efbffdab728463533b54ed3
parent 768032	162bcfce6cade5b58f29f0a289bc44355a5442fe
child 768034	c766164e384ccc9251b8efeb56a50cbe4b3e985c
push id	102784
push user	bmo:hsivonen@hsivonen.fi
push date	Thu, 15 Mar 2018 16:05:17 +0000
bugs	1402247
milestone	61.0a1
intl/encoding_glue/src/lib.rs		file \| annotate \| diff \| comparison \| revisions
js/xpconnect/src/XPCConvert.cpp		file \| annotate \| diff \| comparison \| revisions
xpcom/string/nsReadableUtils.cpp		file \| annotate \| diff \| comparison \| revisions
xpcom/string/nsReadableUtils.h		file \| annotate \| diff \| comparison \| revisions
xpcom/string/nsUTF8Utils.h		file \| annotate \| diff \| comparison \| revisions