Bug 1402247 part 7 - Use the new code from XPConnect.
MozReview-Commit-ID: 1p157aN1zqP
--- a/intl/encoding_glue/src/lib.rs
+++ b/intl/encoding_glue/src/lib.rs
@@ -597,21 +597,41 @@ pub unsafe extern "C" fn encoding_mem_is
}
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_basic_latin(buffer: *const u16, len: usize) -> bool {
encoding_rs::mem::is_basic_latin(::std::slice::from_raw_parts(buffer, len))
}
#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf8_latin1(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_utf8_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf16_latin1(buffer: *const u16, len: usize) -> bool {
+ encoding_rs::mem::is_utf16_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_str_latin1(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_str_latin1(::std::str::from_utf8_unchecked(::std::slice::from_raw_parts(buffer, len)))
+}
+
+#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_utf16_to_latin1_lossy(src: *const u16, src_len: usize, dst: *mut u8, dst_len: usize) {
encoding_rs::mem::convert_utf16_to_latin1_lossy(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len));
}
#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_latin1_lossy(src: *const u8, src_len: usize, dst: *mut u8, dst_len: usize) -> usize {
+ encoding_rs::mem::convert_utf8_to_latin1_lossy(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len))
+}
+
+#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf16(src: *const u8, src_len: usize, dst: *mut u16, dst_len: usize) {
encoding_rs::mem::convert_latin1_to_utf16(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len));
}
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8(src: *const u16, src_len: usize, dst: *mut u8, dst_len: usize) -> usize {
encoding_rs::mem::convert_utf16_to_utf8(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len))
}
--- a/js/xpconnect/src/XPCConvert.cpp
+++ b/js/xpconnect/src/XPCConvert.cpp
@@ -258,51 +258,77 @@ XPCConvert::NativeData2JS(MutableHandleV
{
const nsACString* utf8String = *static_cast<const nsACString* const*>(s);
if (!utf8String || utf8String->IsVoid()) {
d.setNull();
return true;
}
- if (utf8String->IsEmpty()) {
+ uint32_t len = utf8String->Length();
+
+ if (!len) {
d.set(JS_GetEmptyStringValue(cx));
return true;
}
- const uint32_t len = CalcUTF8ToUnicodeLength(*utf8String);
- // The cString is not empty at this point, but the calculated
- // UTF-16 length is zero, meaning no valid conversion exists.
- if (!len)
- return false;
-
- const size_t buffer_size = (len + 1) * sizeof(char16_t);
- char16_t* buffer =
- static_cast<char16_t*>(JS_malloc(cx, buffer_size));
- if (!buffer)
- return false;
-
- uint32_t copied;
- if (!UTF8ToUnicodeBuffer(*utf8String, buffer, &copied) ||
- len != copied) {
- // Copy or conversion during copy failed. Did not copy the
- // whole string.
- JS_free(cx, buffer);
+ CheckedInt<uint32_t> allocLen(len);
+ allocLen += 1;
+ if (!allocLen.isValid()) {
return false;
}
- // JS_NewUCString takes ownership on success, i.e. a
+ // Usage of UTF-8 in XPConnect is mostly for things that are
+ // almost always ASCII, so the inexact allocations below
+ // should be fine.
+
+ if (IsUTF8Latin1(*utf8String)) {
+ char* buffer =
+ static_cast<char*>(JS_malloc(cx, allocLen.value()));
+ if (!buffer) {
+ return false;
+ }
+ size_t written = LossyConvertUTF8toLatin1(*utf8String, MakeSpan(buffer, len));
+ buffer[written] = 0;
+ // The allocation is allocLen (len + 1) bytes: all-ASCII input gives
+ // written == len, so the terminator needs the extra byte.
+ // JS_NewLatin1String takes ownership on success, i.e. it becomes the
+ // responsibility of the JS VM to free the buffer. written <= len.
+ JSString* str = JS_NewLatin1String(cx, reinterpret_cast<JS::Latin1Char*>(buffer), written);
+ if (!str) {
+ JS_free(cx, buffer);
+ return false;
+ }
+ d.setString(str);
+ return true;
+ }
+
+ allocLen *= sizeof(char16_t);
+ if (!allocLen.isValid()) {
+ return false;
+ }
+
+ // ConvertUTF8toUTF16Func requires the destination to be one code unit
+ // longer than the source; allocLen is that size in bytes, overflow-checked.
+ char16_t* buffer =
+ static_cast<char16_t*>(JS_malloc(cx, allocLen.value()));
+ if (!buffer) {
+ return false;
+ }
+ size_t written = ConvertUTF8toUTF16Func(*utf8String, MakeSpan(buffer, len + 1));
+ buffer[written] = 0;
+ // JS_NewUCStringDontDeflate takes ownership on success, i.e. a
// successful call will make it the responsiblity of the JS VM
// to free the buffer.
- JSString* str = JS_NewUCString(cx, buffer, len);
+ // written can never exceed len + 1, so the truncation is OK.
+ JSString* str = JS_NewUCStringDontDeflate(cx, buffer, written);
if (!str) {
JS_free(cx, buffer);
return false;
}
-
d.setString(str);
return true;
}
case nsXPTType::T_CSTRING:
{
const nsACString* cString = *static_cast<const nsACString* const*>(s);
if (!cString || cString->IsVoid()) {
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -125,41 +125,16 @@ ToNewUnicode(const nsACString& aSource)
}
auto len = aSource.Length();
ConvertLatin1toUTF16(aSource, MakeSpan(dest, len));
dest[len] = 0;
return dest;
}
-uint32_t
-CalcUTF8ToUnicodeLength(const nsACString& aSource)
-{
- nsACString::const_iterator start, end;
- CalculateUTF8Length calculator;
- copy_string(aSource.BeginReading(start), aSource.EndReading(end),
- calculator);
- return calculator.Length();
-}
-
-char16_t*
-UTF8ToUnicodeBuffer(const nsACString& aSource, char16_t* aBuffer,
- uint32_t* aUTF16Count)
-{
- nsACString::const_iterator start, end;
- ConvertUTF8toUTF16 converter(aBuffer);
- copy_string(aSource.BeginReading(start),
- aSource.EndReading(end),
- converter).write_terminator();
- if (aUTF16Count) {
- *aUTF16Count = converter.Length();
- }
- return aBuffer;
-}
-
char16_t*
UTF8ToNewUnicode(const nsACString& aSource, uint32_t* aUTF16Count)
{
// Compute length plus one as required by ConvertUTF8toUTF16Func
uint32_t lengthPlusOne = aSource.Length() + 1; // Can't overflow
mozilla::CheckedInt<size_t> allocLength(lengthPlusOne);
// Add space for zero-termination
--- a/xpcom/string/nsReadableUtils.h
+++ b/xpcom/string/nsReadableUtils.h
@@ -18,17 +18,21 @@
#include "nsTArrayForwardDeclare.h"
// Can't include mozilla/Encoding.h here
extern "C" {
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
bool encoding_mem_is_ascii(char const* buffer, size_t buffer_len);
bool encoding_mem_is_basic_latin(char16_t const* buffer, size_t buffer_len);
+ bool encoding_mem_is_utf8_latin1(char const* buffer, size_t buffer_len);
+ bool encoding_mem_is_str_latin1(char const* buffer, size_t buffer_len);
+ bool encoding_mem_is_utf16_latin1(char16_t const* buffer, size_t buffer_len);
void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, size_t src_len, char* dst, size_t dst_len);
+ size_t encoding_mem_convert_utf8_to_latin1_lossy(const char* src, size_t src_len, char* dst, size_t dst_len);
void encoding_mem_convert_latin1_to_utf16(const char* src, size_t src_len, char16_t* dst, size_t dst_len);
size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len, char* dst, size_t dst_len);
size_t encoding_mem_convert_utf8_to_utf16(const char* src, size_t src_len, char16_t* dst, size_t dst_len);
}
// From the nsstring crate
extern "C" {
bool nsstring_fallible_append_utf8_impl(nsAString* aThis, const char* aOther, size_t aOtherLen, size_t aOldLen);
@@ -48,16 +52,29 @@ extern "C" {
*/
inline void
LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource, mozilla::Span<char> aDest)
{
encoding_mem_convert_utf16_to_latin1_lossy(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
}
/**
+ * If all the code points in the input are below U+0100, converts to Latin1, i.e. unsigned byte value is Unicode
+ * scalar value; not windows-1252. If there are code points above U+00FF, asserts in debug builds and produces
+ * garbage in release builds. The nature of the garbage may depend on the CPU architecture and must not be relied upon.
+ *
+ * The length of aDest must not be less than the length of aSource.
+ */
+inline size_t
+LossyConvertUTF8toLatin1(mozilla::Span<const char> aSource, mozilla::Span<char> aDest)
+{
+ return encoding_mem_convert_utf8_to_latin1_lossy(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
* Interprets unsigned byte value as Unicode scalar value (i.e. not windows-1252!).
*
* The length of aDest must be not be less than the length of aSource.
*/
inline void
ConvertLatin1toUTF16(mozilla::Span<const char> aSource, mozilla::Span<char16_t> aDest)
{
encoding_mem_convert_latin1_to_utf16(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
@@ -250,17 +267,16 @@ inline void LossyAppendUTF16toASCII(mozi
* This conversion is not well defined; but it reproduces legacy string behavior.
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
*
* @param aSource a 16-bit wide string
* @return a new |char| buffer you must free with |free|.
*/
char* ToNewCString(const nsAString& aSource);
-
/**
* Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with |free|.
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
*
* @param aSource an 8-bit wide string
* @return a new |char| buffer you must free with |free|.
@@ -309,48 +325,16 @@ char16_t* ToNewUnicode(const nsAString&
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
*
* @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
* @return a new |char16_t| buffer you must free with |free|.
*/
char16_t* ToNewUnicode(const nsACString& aSource);
/**
- * Returns the required length for a char16_t buffer holding
- * a copy of aSource, using UTF-8 to UTF-16 conversion.
- * The length does NOT include any space for zero-termination.
- *
- * @param aSource an 8-bit wide string, UTF-8 encoded
- * @return length of UTF-16 encoded string copy, not zero-terminated
- */
-uint32_t CalcUTF8ToUnicodeLength(const nsACString& aSource);
-
-/**
- * Copies the source string into the specified buffer, converting UTF-8 to
- * UTF-16 in the process. The conversion is well defined for valid UTF-8
- * strings.
- * The copied string will be zero-terminated! Any embedded nulls will be
- * copied nonetheless. It is the caller's responsiblity to ensure the buffer
- * is large enough to hold the string copy plus one char16_t for
- * zero-termination!
- *
- * @see CalcUTF8ToUnicodeLength( const nsACString& )
- * @see UTF8ToNewUnicode( const nsACString&, uint32_t* )
- *
- * @param aSource an 8-bit wide string, UTF-8 encoded
- * @param aBuffer the buffer holding the converted string copy
- * @param aUTF16Count receiving optionally the number of 16-bit units that
- * were copied
- * @return aBuffer pointer, for convenience
- */
-char16_t* UTF8ToUnicodeBuffer(const nsACString& aSource,
- char16_t* aBuffer,
- uint32_t* aUTF16Count = nullptr);
-
-/**
* Returns a new |char16_t| buffer containing a zero-terminated copy
* of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with
* |free|. Performs an encoding conversion from UTF-8 to UTF-16
* while copying |aSource| to your new buffer. Malformed byte sequences
* are replaced with the REPLACEMENT CHARACTER. The new buffer is
* zero-terminated, but that may not help you if |aSource| contains
@@ -427,16 +411,50 @@ inline bool IsASCII(mozilla::Span<const
* @param aString a 8-bit wide string to scan
*/
inline bool IsASCII(mozilla::Span<const char> aString)
{
return encoding_mem_is_ascii(aString.Elements(), aString.Length());
}
/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is, characters in the range [U+0000, U+00FF].
+ *
+ * @param aString a potentially-invalid UTF-16 string to scan
+ */
+inline bool IsUTF16Latin1(mozilla::Span<const char16_t> aString)
+{
+ return encoding_mem_is_utf16_latin1(aString.Elements(), aString.Length());
+}
+
+/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is, characters in the range [U+0000, U+00FF].
+ *
+ * If you know that the argument is always absolutely guaranteed to be valid UTF-8, use the faster UnsafeIsValidUTF8Latin1() instead.
+ *
+ * @param aString potentially-invalid UTF-8 string to scan
+ */
+inline bool IsUTF8Latin1(mozilla::Span<const char> aString)
+{
+ return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length());
+}
+
+/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is, characters in the range [U+0000, U+00FF].
+ *
+ * The argument MUST be valid UTF-8. If you are at all unsure, use IsUTF8Latin1 instead!
+ *
+ * @param aString known-valid UTF-8 string to scan
+ */
+inline bool UnsafeIsValidUTF8Latin1(mozilla::Span<const char> aString)
+{
+ return encoding_mem_is_str_latin1(aString.Elements(), aString.Length());
+}
+
+/**
* Returns |true| if |aString| is a valid UTF-8 string.
*
* Note that this doesn't check whether the string might look like a valid
* string in another encoding, too, e.g. ISO-2022-JP.
*
* @param aString an 8-bit wide string to scan
*/
inline bool IsUTF8(const nsACString& aString)
--- a/xpcom/string/nsUTF8Utils.h
+++ b/xpcom/string/nsUTF8Utils.h
@@ -273,376 +273,16 @@ public:
*aBuffer = p;
return 0xFFFD;
}
MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
}
};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for converting
- * UTF-8 to UTF-16
- */
-class ConvertUTF8toUTF16
-{
-public:
- typedef char value_type;
- typedef char16_t buffer_type;
-
- explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
- : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
- {
- }
-
- size_t Length() const
- {
- return mBuffer - mStart;
- }
-
- bool ErrorEncountered() const
- {
- return mErrorEncountered;
- }
-
- void write(const value_type* aStart, uint32_t aN)
- {
- if (mErrorEncountered) {
- return;
- }
-
- // algorithm assumes utf8 units won't
- // be spread across fragments
- const value_type* p = aStart;
- const value_type* end = aStart + aN;
- buffer_type* out = mBuffer;
- for (; p != end /* && *p */;) {
- bool err;
- uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
-
- if (err) {
- mErrorEncountered = true;
- mBuffer = out;
- return;
- }
-
- if (ucs4 >= PLANE1_BASE) {
- *out++ = (buffer_type)H_SURROGATE(ucs4);
- *out++ = (buffer_type)L_SURROGATE(ucs4);
- } else {
- *out++ = ucs4;
- }
- }
- mBuffer = out;
- }
-
- void write_terminator()
- {
- *mBuffer = buffer_type(0);
- }
-
-private:
- buffer_type* const mStart;
- buffer_type* mBuffer;
- bool mErrorEncountered;
-};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the length of the UTF-16 string equivalent to a UTF-8 string.
- */
-class CalculateUTF8Length
-{
-public:
- typedef char value_type;
-
- CalculateUTF8Length()
- : mLength(0), mErrorEncountered(false)
- {
- }
-
- size_t Length() const
- {
- return mLength;
- }
-
- void write(const value_type* aStart, uint32_t aN)
- {
- // ignore any further requests
- if (mErrorEncountered) {
- return;
- }
-
- // algorithm assumes utf8 units won't
- // be spread across fragments
- const value_type* p = aStart;
- const value_type* end = aStart + aN;
- for (; p < end /* && *p */; ++mLength) {
- if (UTF8traits::isASCII(*p)) {
- p += 1;
- } else if (UTF8traits::is2byte(*p)) {
- p += 2;
- } else if (UTF8traits::is3byte(*p)) {
- p += 3;
- } else if (UTF8traits::is4byte(*p)) {
- // Because a UTF-8 sequence of 4 bytes represents a codepoint
- // greater than 0xFFFF, it will become a surrogate pair in the
- // UTF-16 string, so add 1 more to mLength.
- // This doesn't happen with is5byte and is6byte because they
- // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
- // converted to a single replacement character.
-
- // However, there is one case when a 4 byte UTF-8 sequence will
- // only generate 2 UTF-16 bytes. If we have a properly encoded
- // sequence, but with an invalid value (too small or too big),
- // that will result in a replacement character being written
- // This replacement character is encoded as just 1 single
- // UTF-16 character, which is 2 bytes.
-
- // The below code therefore only adds 1 to mLength if the UTF8
- // data will produce a decoded character which is greater than
- // or equal to 0x010000 and less than 0x0110000.
-
- // A 4byte UTF8 character is encoded as
- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
- // map to bit 17-21 in the final result. If these bits are
- // between 0x01 and 0x11, that means that the final result is
- // between 0x010000 and 0x110000. The below code reads these
- // bits out and assigns them to c, but shifted up 4 bits to
- // avoid having to shift twice.
-
- // It doesn't matter what to do in the case where p + 4 > end
- // since no UTF16 characters will be written in that case by
- // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
- // any of the surrogate bits are wrong since no UTF16
- // characters will be written in that case either.
-
- if (p + 4 <= end) {
- uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
- ((uint32_t)(p[1] & 0x30));
- if (c >= 0x010 && c < 0x110) {
- ++mLength;
- }
- }
-
- p += 4;
- } else if (UTF8traits::is5byte(*p)) {
- p += 5;
- } else if (UTF8traits::is6byte(*p)) {
- p += 6;
- } else { // error
- ++mLength; // to account for the decrement below
- break;
- }
- }
- if (p != end) {
- NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
- --mLength; // The last multi-byte char wasn't complete, discard it.
- mErrorEncountered = true;
- }
- }
-
-private:
- size_t mLength;
- bool mErrorEncountered;
-};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for
- * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
- * (0xEFBFBD in UTF-8).
- */
-class ConvertUTF16toUTF8
-{
-public:
- typedef char16_t value_type;
- typedef char buffer_type;
-
- // The error handling here is more lenient than that in
- // |ConvertUTF8toUTF16|, but it's that way for backwards
- // compatibility.
-
- explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
- : mStart(aBuffer), mBuffer(aBuffer)
- {
- }
-
- size_t Size() const
- {
- return mBuffer - mStart;
- }
-
- void write(const value_type* aStart, uint32_t aN)
- {
- buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
-
- for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
- value_type c = *p;
- if (!(c & 0xFF80)) { // U+0000 - U+007F
- *out++ = (char)c;
- } else if (!(c & 0xF800)) { // U+0100 - U+07FF
- *out++ = 0xC0 | (char)(c >> 6);
- *out++ = 0x80 | (char)(0x003F & c);
- } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
- *out++ = 0xE0 | (char)(c >> 12);
- *out++ = 0x80 | (char)(0x003F & (c >> 6));
- *out++ = 0x80 | (char)(0x003F & c);
- } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
- // D800- DBFF - High Surrogate
- value_type h = c;
-
- ++p;
- if (p == end) {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
-
- UTF8UTILS_WARNING("String ending in half a surrogate pair!");
-
- break;
- }
- c = *p;
-
- if (NS_IS_LOW_SURROGATE(c)) {
- // DC00- DFFF - Low Surrogate
- // N = (H - D800) *400 + 10000 + ( L - DC00 )
- uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
-
- // 0001 0000-001F FFFF
- *out++ = 0xF0 | (char)(ucs4 >> 18);
- *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
- *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
- *out++ = 0x80 | (char)(0x003F & ucs4);
- } else {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
-
- // The pointer to the next character points to the second
- // 16-bit value, not beyond it, as per Unicode 5.0.0
- // Chapter 3 C10, only the first code unit of an illegal
- // sequence must be treated as an illegally terminated
- // code unit sequence (also Chapter 3 D91, "isolated [not
- // paired and ill-formed] UTF-16 code units in the range
- // D800..DFFF are ill-formed").
- p--;
-
- UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
- }
- } else { // U+DC00 - U+DFFF
- // Treat broken characters as the Unicode replacement
- // character 0xFFFD (0xEFBFBD in UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
-
- // DC00- DFFF - Low Surrogate
- UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
- }
- }
-
- mBuffer = out;
- }
-
- void write_terminator()
- {
- *mBuffer = buffer_type(0);
- }
-
-private:
- buffer_type* const mStart;
- buffer_type* mBuffer;
-};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
- * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
- */
-class CalculateUTF8Size
-{
-public:
- typedef char16_t value_type;
-
- CalculateUTF8Size()
- : mSize(0)
- {
- }
-
- size_t Size() const
- {
- return mSize;
- }
-
- void write(const value_type* aStart, uint32_t aN)
- {
- // Assume UCS2 surrogate pairs won't be spread across fragments.
- for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
- value_type c = *p;
- if (!(c & 0xFF80)) { // U+0000 - U+007F
- mSize += 1;
- } else if (!(c & 0xF800)) { // U+0100 - U+07FF
- mSize += 2;
- } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
- mSize += 3;
- } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
- ++p;
- if (p == end) {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- mSize += 3;
-
- UTF8UTILS_WARNING("String ending in half a surrogate pair!");
-
- break;
- }
- c = *p;
-
- if (0xDC00 == (0xFC00 & c)) {
- mSize += 4;
- } else {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- mSize += 3;
-
- // The next code unit is the second 16-bit value, not
- // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
- // only the first code unit of an illegal sequence must
- // be treated as an illegally terminated code unit
- // sequence (also Chapter 3 D91, "isolated [not paired and
- // ill-formed] UTF-16 code units in the range D800..DFFF
- // are ill-formed").
- p--;
-
- UTF8UTILS_WARNING("got a high Surrogate but no low surrogate");
- }
- } else { // U+DC00 - U+DFFF
- // Treat broken characters as the Unicode replacement
- // character 0xFFFD (0xEFBFBD in UTF-8)
- mSize += 3;
-
- UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
- }
- }
- }
-
-private:
- size_t mSize;
-};
-
#ifdef MOZILLA_INTERNAL_API
/**
* A character sink that performs a |reinterpret_cast|-style conversion
* from char to char16_t.
*/
class LossyConvertEncoding8to16
{
public: