--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1425,16 +1425,17 @@ dependencies = [
"nsstring 0.1.0",
]
[[package]]
name = "nsstring"
version = "0.1.0"
dependencies = [
"bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "nsstring-gtest"
version = "0.1.0"
dependencies = [
"nsstring 0.1.0",
]
--- a/intl/encoding_glue/src/lib.rs
+++ b/intl/encoding_glue/src/lib.rs
@@ -7,23 +7,23 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// Adapted from third_party/rust/encoding_rs/src/lib.rs, so the
// "top-level directory" in the above notice refers to
// third_party/rust/encoding_rs/.
extern crate encoding_rs;
+extern crate nserror;
extern crate nsstring;
-extern crate nserror;
+use encoding_rs::*;
+use nserror::*;
+use nsstring::*;
use std::slice;
-use encoding_rs::*;
-use nsstring::*;
-use nserror::*;
// nsStringBuffer's internal bookkeeping takes 8 bytes from
// the allocation. Plus one for termination.
const NS_CSTRING_OVERHEAD: usize = 9;
/// Takes `Option<usize>`, the destination string and a value
/// to return on failure and tries to set the length of the
/// destination string to the `usize` wrapped in the first
@@ -585,8 +585,100 @@ fn checked_min(one: Option<usize>, other
}
// Bindings for encoding_rs::mem. These may move to a separate crate in the future.
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_utf16_bidi(buffer: *const u16, len: usize) -> bool {
encoding_rs::mem::is_utf16_bidi(::std::slice::from_raw_parts(buffer, len))
}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_ascii(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_ascii(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_basic_latin(buffer: *const u16, len: usize) -> bool {
+ encoding_rs::mem::is_basic_latin(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf8_latin1(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_utf8_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_utf16_latin1(buffer: *const u16, len: usize) -> bool {
+ encoding_rs::mem::is_utf16_latin1(::std::slice::from_raw_parts(buffer, len))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_is_str_latin1(buffer: *const u8, len: usize) -> bool {
+ encoding_rs::mem::is_str_latin1(::std::str::from_utf8_unchecked(
+ ::std::slice::from_raw_parts(buffer, len),
+ ))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf16_to_latin1_lossy(
+ src: *const u16,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) {
+ encoding_rs::mem::convert_utf16_to_latin1_lossy(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ );
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_latin1_lossy(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) -> usize {
+ encoding_rs::mem::convert_utf8_to_latin1_lossy(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf16(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u16,
+ dst_len: usize,
+) {
+ encoding_rs::mem::convert_latin1_to_utf16(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ );
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8(
+ src: *const u16,
+ src_len: usize,
+ dst: *mut u8,
+ dst_len: usize,
+) -> usize {
+ encoding_rs::mem::convert_utf16_to_utf8(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_utf16(
+ src: *const u8,
+ src_len: usize,
+ dst: *mut u16,
+ dst_len: usize,
+) -> usize {
+ encoding_rs::mem::convert_utf8_to_utf16(
+ ::std::slice::from_raw_parts(src, src_len),
+ ::std::slice::from_raw_parts_mut(dst, dst_len),
+ )
+}
--- a/js/xpconnect/src/XPCConvert.cpp
+++ b/js/xpconnect/src/XPCConvert.cpp
@@ -277,46 +277,88 @@ XPCConvert::NativeData2JS(MutableHandleV
return true;
}
if (utf8String->IsEmpty()) {
d.set(JS_GetEmptyStringValue(cx));
return true;
}
- const uint32_t len = CalcUTF8ToUnicodeLength(*utf8String);
- // The cString is not empty at this point, but the calculated
- // UTF-16 length is zero, meaning no valid conversion exists.
- if (!len)
- return false;
-
- const size_t buffer_size = (len + 1) * sizeof(char16_t);
- char16_t* buffer =
- static_cast<char16_t*>(JS_malloc(cx, buffer_size));
- if (!buffer)
- return false;
-
- uint32_t copied;
- if (!UTF8ToUnicodeBuffer(*utf8String, buffer, &copied) ||
- len != copied) {
- // Copy or conversion during copy failed. Did not copy the
- // whole string.
- JS_free(cx, buffer);
+ uint32_t len = utf8String->Length();
+ auto allocLen = CheckedUint32(len) + 1;
+ if (!allocLen.isValid()) {
return false;
}
- // JS_NewUCString takes ownership on success, i.e. a
+ // Usage of UTF-8 in XPConnect is mostly for things that are
+ // almost always ASCII, so the inexact allocations below
+ // should be fine.
+
+ if (IsUTF8Latin1(*utf8String)) {
+ char* buffer = static_cast<char*>(JS_malloc(cx, allocLen.value()));
+ if (!buffer) {
+ return false;
+ }
+ size_t written =
+ LossyConvertUTF8toLatin1(*utf8String, MakeSpan(buffer, len));
+ buffer[written] = 0;
+
+ // JS_NewLatin1String takes ownership on success, i.e. a
+ // successful call will make it the responsibility of the JS VM
+ // to free the buffer.
+ // written can never exceed len, so the truncation is OK.
+ JSString* str = JS_NewLatin1String(
+ cx, reinterpret_cast<JS::Latin1Char*>(buffer), written);
+ if (!str) {
+ JS_free(cx, buffer);
+ return false;
+ }
+ d.setString(str);
+ return true;
+ }
+
+ // 1-byte sequences decode to 1 UTF-16 code unit
+ // 2-byte sequences decode to 1 UTF-16 code unit
+ // 3-byte sequences decode to 1 UTF-16 code unit
+ // 4-byte sequences decode to 2 UTF-16 code units
+ // So the number of output code units never exceeds
+ // the number of input code units (but see the comment
+ // below). allocLen already takes the zero terminator
+ // into account.
+ allocLen *= sizeof(char16_t);
+ if (!allocLen.isValid()) {
+ return false;
+ }
+
+ char16_t* buffer =
+ static_cast<char16_t*>(JS_malloc(cx, allocLen.value()));
+ if (!buffer) {
+ return false;
+ }
+
+ // For its internal simplicity, ConvertUTF8toUTF16 requires the
+ // destination to be one code unit longer than the source, but
+ // it never actually writes more code units than the number of
+ // code units in the source. That's why it's OK to claim the
+ // output buffer has len + 1 space but then still expect to
+ // have space for the zero terminator.
+ size_t written =
+ ConvertUTF8toUTF16(*utf8String, MakeSpan(buffer, len + 1));
+ MOZ_RELEASE_ASSERT(written <= len);
+ buffer[written] = 0;
+
+ // JS_NewUCStringDontDeflate takes ownership on success, i.e. a
// successful call will make it the responsiblity of the JS VM
// to free the buffer.
- JSString* str = JS_NewUCString(cx, buffer, len);
+ // written can never exceed len (asserted above), so the truncation is OK.
+ JSString* str = JS_NewUCStringDontDeflate(cx, buffer, written);
if (!str) {
JS_free(cx, buffer);
return false;
}
-
d.setString(str);
return true;
}
case nsXPTType::T_CSTRING:
{
const nsACString* cString = static_cast<const nsACString*>(s);
if (!cString || cString->IsVoid()) {
@@ -1218,19 +1260,20 @@ JSErrorToXPCException(const char* toStri
Exception** exceptn)
{
AutoJSContext cx;
nsresult rv = NS_ERROR_FAILURE;
RefPtr<nsScriptError> data;
if (report) {
nsAutoString bestMessage;
if (report && report->message()) {
- CopyUTF8toUTF16(report->message().c_str(), bestMessage);
+ CopyUTF8toUTF16(mozilla::MakeStringSpan(report->message().c_str()),
+ bestMessage);
} else if (toStringResult) {
- CopyUTF8toUTF16(toStringResult, bestMessage);
+ CopyUTF8toUTF16(mozilla::MakeStringSpan(toStringResult), bestMessage);
} else {
bestMessage.AssignLiteral("JavaScript Error");
}
const char16_t* linebuf = report->linebuf();
data = new nsScriptError();
data->InitWithWindowID(
--- a/servo/ports/geckolib/glue.rs
+++ b/servo/ports/geckolib/glue.rs
@@ -3515,17 +3515,17 @@ pub extern "C" fn Servo_DeclarationBlock
pub extern "C" fn Servo_DeclarationBlock_GetNthProperty(
declarations: RawServoDeclarationBlockBorrowed,
index: u32,
result: *mut nsAString,
) -> bool {
read_locked_arc(declarations, |decls: &PropertyDeclarationBlock| {
if let Some(decl) = decls.declarations().get(index as usize) {
let result = unsafe { result.as_mut().unwrap() };
- result.assign_utf8(&decl.id().name());
+ result.assign_str(&decl.id().name());
true
} else {
false
}
})
}
macro_rules! get_property_id_from_property {
--- a/servo/support/gecko/nsstring/Cargo.toml
+++ b/servo/support/gecko/nsstring/Cargo.toml
@@ -5,9 +5,9 @@ authors = ["nobody@mozilla.com"]
license = "MPL-2.0"
description = "Rust bindings to xpcom string types"
[features]
gecko_debug = []
[dependencies]
bitflags = "1.0"
-
+encoding_rs = "0.8.0"
new file mode 100644
--- /dev/null
+++ b/servo/support/gecko/nsstring/src/conversions.rs
@@ -0,0 +1,712 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+extern crate encoding_rs;
+
+use std::slice;
+
+use super::nsACString;
+use super::nsAString;
+use super::nsCStringLike;
+use super::BulkWriteOk;
+use super::Gecko_FallibleAssignCString;
+use super::Latin1StringLike;
+
+use conversions::encoding_rs::mem::*;
+use conversions::encoding_rs::Encoding;
+
+/// Required math stated in the docs of
+/// `convert_utf16_to_utf8()`.
+#[inline(always)]
+fn times_three_plus_one(a: usize) -> Option<usize> {
+ a.checked_mul(3)?.checked_add(1)
+}
+
+#[inline(always)]
+fn identity(a: usize) -> Option<usize> {
+ Some(a)
+}
+
+#[inline(always)]
+fn plus_one(a: usize) -> Option<usize> {
+ a.checked_add(1)
+}
+
+/// Typical cache line size per
+/// https://stackoverflow.com/questions/14707803/line-size-of-l1-and-l2-caches
+///
+/// For consistent behavior, not trying to use 128 on aarch64
+/// or other fanciness like that.
+const CACHE_LINE: usize = 64;
+
+const CACHE_LINE_MASK: usize = CACHE_LINE - 1;
+
+#[inline(always)]
+fn starts_with_ascii(buffer: &[u8]) -> bool {
+ // We examine data only up to the end of the cache line
+ // to make this check minimally disruptive.
+ let bound = if buffer.len() <= CACHE_LINE {
+ buffer.len()
+ } else {
+ CACHE_LINE - ((buffer.as_ptr() as usize) & CACHE_LINE_MASK)
+ };
+ is_ascii(&buffer[..bound])
+}
+
+#[inline(always)]
+fn starts_with_basic_latin(buffer: &[u16]) -> bool {
+ // We look at two cache lines with code unit size of two. There is a need
+ // to look at more than one cache line in the UTF-16 case, because looking
+ // at just one cache line wouldn't catch non-ASCII Latin with high enough
+ // probability with Latin-script languages that have relatively infrequent
+ // non-ASCII characters.
+ let bound = if buffer.len() <= CACHE_LINE {
+ buffer.len()
+ } else {
+ (CACHE_LINE * 2 - ((buffer.as_ptr() as usize) & CACHE_LINE_MASK)) / 2
+ };
+ is_basic_latin(&buffer[..bound])
+}
+
+// Ignoring the copy avoidance complications of conversions between Latin1 and
+// UTF-8, a conversion function has the outward form of
+// `fn F(&mut self, other: &[T], old_len: usize) -> Result<BulkWriteOk, ()>`,
+// where `T` is either `u8` or `u16`. `other` is the slice whose converted
+// contents are to be appended to `self` and `old_len` indicates how many
+// code units of `self` are to be preserved (0 for the assignment case and
+// `self.len()` for the appending case).
+//
+// As implementation parameters a conversion function needs to know the
+// math for computing the worst case conversion length in code units given
+// the input length in code units. For a _constant conversion_ the number
+// of code units the conversion produces equals the number of code units
+// in the input. For a _shrinking conversion_ the maximum number of code
+// units the conversion can produce equals the number of code units in
+// the input, but the conversion can produce fewer code units. Still, due
+// to implementation details, the function might want _one_ unit more of
+// output space. For an _expanding conversion_ (no need for macro), the
+// minimum number of code units produced by the conversion is the number
+// of code units in the input, but the conversion can produce more.
+//
+// Copy avoidance conversions avoid copying a refcounted buffer when it's
+// ASCII-only.
+//
+// Internally, a conversion function needs to know the underlying
+// encoding_rs conversion function, the math for computing the required
+// output buffer size and, depending on the case, the underlying
+// encoding_rs ASCII prefix handling function.
+
+/// A conversion where the number of code units in the output is potentially
+/// smaller than the number of code units in the input.
+///
+/// Takes the name of the method to be generated, the name of the conversion
+/// function and the type of the input slice.
+///
+/// `$name` is the name of the function to generate
+/// `$convert` is the underlying `encoding_rs::mem` function to use
+/// `$other_ty` is the type of the input slice
+/// `$math` is the worst-case length math that `$convert` expects
+macro_rules! shrinking_conversion {
+ (name = $name:ident,
+ convert = $convert:ident,
+ other_ty = $other_ty:ty,
+ math = $math:ident) => (
+ fn $name(&mut self, other: $other_ty, old_len: usize) -> Result<BulkWriteOk, ()> {
+ let needed = $math(other.len()).ok_or(())?;
+ let mut handle = unsafe {
+ self.bulk_write(old_len.checked_add(needed).ok_or(())?, old_len, false)?
+ };
+ let written = $convert(other, &mut handle.as_mut_slice()[old_len..]);
+ Ok(handle.finish(old_len + written, true))
+ }
+ )
+}
+
+/// A conversion where the number of code units in the output is always equal
+/// to the number of code units in the input.
+///
+/// Takes the name of the method to be generated, the name of the conversion
+/// function and the type of the input slice.
+///
+/// `$name` is the name of the function to generate
+/// `$convert` is the underlying `encoding_rs::mem` function to use
+/// `$other_ty` is the type of the input slice
+macro_rules! constant_conversion {
+ (name = $name:ident,
+ convert = $convert:ident,
+ other_ty = $other_ty:ty) => (
+ fn $name(&mut self, other: $other_ty, old_len: usize) -> Result<BulkWriteOk, ()> {
+ let new_len = old_len.checked_add(other.len()).ok_or(())?;
+ let mut handle = unsafe {
+ self.bulk_write(new_len, old_len, true)?
+ };
+ $convert(other, &mut handle.as_mut_slice()[old_len..]);
+ Ok(handle.finish(new_len, false))
+ }
+ )
+}
+
+/// An intermediate check for avoiding a copy and having an `nsStringBuffer`
+/// refcount increment instead when both `self` and `other` are `nsACString`s,
+/// `other` is entirely ASCII and all old data in `self` is discarded.
+///
+/// `$name` is the name of the function to generate
+/// `$impl` is the underlying conversion that takes a slice and that is used
+/// when we can't just adopt the incoming buffer as-is
+/// `$string_like` is the kind of input taken
+macro_rules! ascii_copy_avoidance {
+ (name = $name:ident,
+ implementation = $implementation:ident,
+ string_like = $string_like:ident) => (
+ fn $name<T: $string_like + ?Sized>(&mut self, other: &T, old_len: usize) -> Result<BulkWriteOk, ()> {
+ let adapter = other.adapt();
+ let other_slice = adapter.as_ref();
+ let num_ascii = if adapter.is_abstract() && old_len == 0 {
+ let up_to = Encoding::ascii_valid_up_to(other_slice);
+ if up_to == other_slice.len() {
+ // Calling something whose argument can be obtained from
+ // the adapter rather than an nsStringLike avoids a huge
+ // lifetime mess by keeping nsStringLike and
+ // Latin1StringLike free of lifetime interdependencies.
+ if unsafe { Gecko_FallibleAssignCString(self, other.adapt().as_ptr()) } {
+ return Ok(BulkWriteOk{});
+ } else {
+ return Err(());
+ }
+ }
+ Some(up_to)
+ } else {
+ None
+ };
+ self.$implementation(other_slice, old_len, num_ascii)
+ }
+ )
+}
+
+impl nsAString {
+ // Valid UTF-8 to UTF-16
+
+ // Documentation says the destination buffer needs to have
+ // as many code units as the input.
+ shrinking_conversion!(
+ name = fallible_append_str_impl,
+ convert = convert_str_to_utf16,
+ other_ty = &str,
+ math = identity
+ );
+
+ /// Convert a valid UTF-8 string into valid UTF-16 and replace the content
+ /// of this string with the conversion result.
+ pub fn assign_str(&mut self, other: &str) {
+ self.fallible_append_str_impl(other, 0)
+ .expect("Out of memory");
+ }
+
+ /// Convert a valid UTF-8 string into valid UTF-16 and fallibly replace the
+ /// content of this string with the conversion result.
+ pub fn fallible_assign_str(&mut self, other: &str) -> Result<(), ()> {
+ self.fallible_append_str_impl(other, 0).map(|_| ())
+ }
+
+ /// Convert a valid UTF-8 string into valid UTF-16 and append the conversion
+ /// to this string.
+ pub fn append_str(&mut self, other: &str) {
+ let len = self.len();
+ self.fallible_append_str_impl(other, len)
+ .expect("Out of memory");
+ }
+
+ /// Convert a valid UTF-8 string into valid UTF-16 and fallibly append the
+ /// conversion to this string.
+ pub fn fallible_append_str(&mut self, other: &str) -> Result<(), ()> {
+ let len = self.len();
+ self.fallible_append_str_impl(other, len).map(|_| ())
+ }
+
+ // Potentially-invalid UTF-8 to UTF-16
+
+ // Documentation says the destination buffer needs to have
+ // one more code unit than the input.
+ shrinking_conversion!(
+ name = fallible_append_utf8_impl,
+ convert = convert_utf8_to_utf16,
+ other_ty = &[u8],
+ math = plus_one
+ );
+
+ /// Convert a potentially-invalid UTF-8 string into valid UTF-16
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// replace the content of this string with the conversion result.
+ pub fn assign_utf8(&mut self, other: &[u8]) {
+ self.fallible_append_utf8_impl(other, 0)
+ .expect("Out of memory");
+ }
+
+ /// Convert a potentially-invalid UTF-8 string into valid UTF-16
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// fallibly replace the content of this string with the conversion result.
+ pub fn fallible_assign_utf8(&mut self, other: &[u8]) -> Result<(), ()> {
+ self.fallible_append_utf8_impl(other, 0).map(|_| ())
+ }
+
+ /// Convert a potentially-invalid UTF-8 string into valid UTF-16
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// append the conversion result to this string.
+ pub fn append_utf8(&mut self, other: &[u8]) {
+ let len = self.len();
+ self.fallible_append_utf8_impl(other, len)
+ .expect("Out of memory");
+ }
+
+ /// Convert a potentially-invalid UTF-8 string into valid UTF-16
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// fallibly append the conversion result to this string.
+ pub fn fallible_append_utf8(&mut self, other: &[u8]) -> Result<(), ()> {
+ let len = self.len();
+ self.fallible_append_utf8_impl(other, len).map(|_| ())
+ }
+
+ // Latin1 to UTF-16
+
+ constant_conversion!(
+ name = fallible_append_latin1_impl,
+ convert = convert_latin1_to_utf16,
+ other_ty = &[u8]
+ );
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-16 and replace the content of this string with the conversion result.
+ pub fn assign_latin1(&mut self, other: &[u8]) {
+ self.fallible_append_latin1_impl(other, 0)
+ .expect("Out of memory");
+ }
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-16 and fallibly replace the content of this string with the
+ /// conversion result.
+ pub fn fallible_assign_latin1(&mut self, other: &[u8]) -> Result<(), ()> {
+ self.fallible_append_latin1_impl(other, 0).map(|_| ())
+ }
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-16 and append the conversion result to this string.
+ pub fn append_latin1(&mut self, other: &[u8]) {
+ let len = self.len();
+ self.fallible_append_latin1_impl(other, len)
+ .expect("Out of memory");
+ }
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-16 and fallibly append the conversion result to this string.
+ pub fn fallible_append_latin1(&mut self, other: &[u8]) -> Result<(), ()> {
+ let len = self.len();
+ self.fallible_append_latin1_impl(other, len).map(|_| ())
+ }
+}
+
+impl nsACString {
+ // UTF-16 to UTF-8
+
+ fn fallible_append_utf16_to_utf8_impl(
+ &mut self,
+ other: &[u16],
+ old_len: usize,
+ ) -> Result<BulkWriteOk, ()> {
+ // We first size the buffer for ASCII if `starts_with_basic_latin` finds an ASCII prefix. If that
+ // turns out not to be enough, we size for the worst case given the length of the remaining input.
+ let (filled, num_ascii, mut handle) = if starts_with_basic_latin(other) {
+ let new_len_with_ascii = old_len.checked_add(other.len()).ok_or(())?;
+ let mut handle = unsafe { self.bulk_write(new_len_with_ascii, old_len, false)? };
+ let num_ascii = copy_basic_latin_to_ascii(other, &mut handle.as_mut_slice()[old_len..]);
+ let left = other.len() - num_ascii;
+ if left == 0 {
+ return Ok(handle.finish(old_len + num_ascii, true));
+ }
+ let filled = old_len + num_ascii;
+ let needed = times_three_plus_one(left).ok_or(())?;
+ let new_len = filled.checked_add(needed).ok_or(())?;
+ unsafe {
+ handle.restart_bulk_write(new_len, filled, false)?;
+ }
+ (filled, num_ascii, handle)
+ } else {
+ // Started with non-ASCII. Compute worst case
+ let needed = times_three_plus_one(other.len()).ok_or(())?;
+ let new_len = old_len.checked_add(needed).ok_or(())?;
+ let mut handle = unsafe { self.bulk_write(new_len, old_len, false)? };
+ (old_len, 0, handle)
+ };
+ let written =
+ convert_utf16_to_utf8(&other[num_ascii..], &mut handle.as_mut_slice()[filled..]);
+ Ok(handle.finish(filled + written, true))
+ }
+
+ /// Convert a potentially-invalid UTF-16 string into valid UTF-8
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// replace the content of this string with the conversion result.
+ pub fn assign_utf16_to_utf8(&mut self, other: &[u16]) {
+ self.fallible_append_utf16_to_utf8_impl(other, 0)
+ .expect("Out of memory");
+ }
+
+ /// Convert a potentially-invalid UTF-16 string into valid UTF-8
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// fallibly replace the content of this string with the conversion result.
+ pub fn fallible_assign_utf16_to_utf8(&mut self, other: &[u16]) -> Result<(), ()> {
+ self.fallible_append_utf16_to_utf8_impl(other, 0)
+ .map(|_| ())
+ }
+
+ /// Convert a potentially-invalid UTF-16 string into valid UTF-8
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// append the conversion result to this string.
+ pub fn append_utf16_to_utf8(&mut self, other: &[u16]) {
+ let len = self.len();
+ self.fallible_append_utf16_to_utf8_impl(other, len)
+ .expect("Out of memory");
+ }
+
+ /// Convert a potentially-invalid UTF-16 string into valid UTF-8
+ /// (replacing invalid sequences with the REPLACEMENT CHARACTER) and
+ /// fallibly append the conversion result to this string.
+ pub fn fallible_append_utf16_to_utf8(&mut self, other: &[u16]) -> Result<(), ()> {
+ let len = self.len();
+ self.fallible_append_utf16_to_utf8_impl(other, len)
+ .map(|_| ())
+ }
+
+ // UTF-16 to Latin1
+
+ constant_conversion!(
+ name = fallible_append_utf16_to_latin1_lossy_impl,
+ convert = convert_utf16_to_latin1_lossy,
+ other_ty = &[u16]
+ );
+
+ /// Convert a UTF-16 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// replace the content of this string with the conversion result.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-16,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn assign_utf16_to_latin1_lossy(&mut self, other: &[u16]) {
+ self.fallible_append_utf16_to_latin1_lossy_impl(other, 0)
+ .expect("Out of memory");
+ }
+
+ /// Convert a UTF-16 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// fallibly replace the content of this string with the conversion result.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-16,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn fallible_assign_utf16_to_latin1_lossy(&mut self, other: &[u16]) -> Result<(), ()> {
+ self.fallible_append_utf16_to_latin1_lossy_impl(other, 0)
+ .map(|_| ())
+ }
+
+ /// Convert a UTF-16 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// append the conversion result to this string.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-16,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn append_utf16_to_latin1_lossy(&mut self, other: &[u16]) {
+ let len = self.len();
+ self.fallible_append_utf16_to_latin1_lossy_impl(other, len)
+ .expect("Out of memory");
+ }
+
+ /// Convert a UTF-16 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// fallibly append the conversion result to this string.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-16,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn fallible_append_utf16_to_latin1_lossy(&mut self, other: &[u16]) -> Result<(), ()> {
+ let len = self.len();
+ self.fallible_append_utf16_to_latin1_lossy_impl(other, len)
+ .map(|_| ())
+ }
+
+ // UTF-8 to Latin1
+
+ ascii_copy_avoidance!(
+ name = fallible_append_utf8_to_latin1_lossy_check,
+ implementation = fallible_append_utf8_to_latin1_lossy_impl,
+ string_like = nsCStringLike
+ );
+
+ fn fallible_append_utf8_to_latin1_lossy_impl(
+ &mut self,
+ other: &[u8],
+ old_len: usize,
+ maybe_num_ascii: Option<usize>,
+ ) -> Result<BulkWriteOk, ()> {
+ let new_len = old_len.checked_add(other.len()).ok_or(())?;
+ let num_ascii = maybe_num_ascii.unwrap_or(0);
+ // Already checked for overflow above, so this can't overflow.
+ let old_len_plus_num_ascii = old_len + num_ascii;
+ let mut handle = unsafe { self.bulk_write(new_len, old_len, false)? };
+ let written = {
+ let buffer = handle.as_mut_slice();
+ if num_ascii != 0 {
+ (&mut buffer[old_len..old_len_plus_num_ascii]).copy_from_slice(&other[..num_ascii]);
+ }
+ convert_utf8_to_latin1_lossy(&other[num_ascii..], &mut buffer[old_len_plus_num_ascii..])
+ };
+ Ok(handle.finish(old_len_plus_num_ascii + written, true))
+ }
+
+ /// Convert a UTF-8 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// replace the content of this string with the conversion result.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-8,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn assign_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
+ self.fallible_append_utf8_to_latin1_lossy_check(other, 0)
+ .expect("Out of memory");
+ }
+
+ /// Convert a UTF-8 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// fallibly replace the content of this string with the conversion result.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-8,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn fallible_assign_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(
+ &mut self,
+ other: &T,
+ ) -> Result<(), ()> {
+ self.fallible_append_utf8_to_latin1_lossy_check(other, 0)
+ .map(|_| ())
+ }
+
+ /// Convert a UTF-8 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// append the conversion result to this string.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-8,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn append_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
+ let len = self.len();
+ self.fallible_append_utf8_to_latin1_lossy_check(other, len)
+ .expect("Out of memory");
+ }
+
+ /// Convert a UTF-8 string whose code points are all below U+0100 into
+ /// a Latin1 (scalar value is byte value; not windows-1252!) string and
+ /// fallibly append the conversion result to this string.
+ ///
+ /// # Panics
+ ///
+ /// If the input contains code points above U+00FF or is not valid UTF-8,
+ /// panics in debug mode and produces garbage in a memory-safe way in
+ /// release builds. The nature of the garbage may differ based on CPU
+ /// architecture and must not be relied upon.
+ pub fn fallible_append_utf8_to_latin1_lossy<T: nsCStringLike + ?Sized>(
+ &mut self,
+ other: &T,
+ ) -> Result<(), ()> {
+ let len = self.len();
+ self.fallible_append_utf8_to_latin1_lossy_check(other, len)
+ .map(|_| ())
+ }
+
+ // Latin1 to UTF-8 CString
+
+ ascii_copy_avoidance!(
+ name = fallible_append_latin1_to_utf8_check,
+ implementation = fallible_append_latin1_to_utf8_impl,
+ string_like = Latin1StringLike
+ );
+
+ fn fallible_append_latin1_to_utf8_impl(
+ &mut self,
+ other: &[u8],
+ old_len: usize,
+ maybe_num_ascii: Option<usize>,
+ ) -> Result<BulkWriteOk, ()> {
+ let (filled, num_ascii, mut handle) = if let Some(num_ascii) = maybe_num_ascii {
+ // Wrapper checked for ASCII
+ let left = other.len() - num_ascii;
+ let filled = old_len + num_ascii;
+ let needed = left.checked_mul(2).ok_or(())?;
+ let new_len = filled.checked_add(needed).ok_or(())?;
+ let mut handle = unsafe { self.bulk_write(new_len, old_len, false)? };
+ if num_ascii != 0 {
+ (&mut handle.as_mut_slice()[old_len..filled]).copy_from_slice(&other[..num_ascii]);
+ }
+ (filled, num_ascii, handle)
+ } else if starts_with_ascii(other) {
+ // Wrapper didn't check for ASCII, but the prefix check above found that
+ // `other` starts with ASCII, so let's first size the buffer
+ // with optimism that it's ASCII-only.
+ let new_len_with_ascii = old_len.checked_add(other.len()).ok_or(())?;
+ let mut handle = unsafe { self.bulk_write(new_len_with_ascii, old_len, false)? };
+ let num_ascii = copy_ascii_to_ascii(other, &mut handle.as_mut_slice()[old_len..]);
+ let left = other.len() - num_ascii;
+ let filled = old_len + num_ascii;
+ if left == 0 {
+ // `other` was all ASCII
+ return Ok(handle.finish(filled, true));
+ }
+ let needed = left.checked_mul(2).ok_or(())?;
+ let new_len = filled.checked_add(needed).ok_or(())?;
+ unsafe {
+ handle.restart_bulk_write(new_len, filled, false)?;
+ }
+ (filled, num_ascii, handle)
+ } else {
+ // Started with non-ASCII. Assume worst case.
+ let needed = other.len().checked_mul(2).ok_or(())?;
+ let new_len = old_len.checked_add(needed).ok_or(())?;
+ let mut handle = unsafe { self.bulk_write(new_len, old_len, false)? };
+ (old_len, 0, handle)
+ };
+ let written =
+ convert_latin1_to_utf8(&other[num_ascii..], &mut handle.as_mut_slice()[filled..]);
+ Ok(handle.finish(filled + written, true))
+ }
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-8 and replace the content of this string with the conversion result.
+ pub fn assign_latin1_to_utf8<T: Latin1StringLike + ?Sized>(&mut self, other: &T) {
+ self.fallible_append_latin1_to_utf8_check(other, 0)
+ .expect("Out of memory");
+ }
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-8 and fallibly replace the content of this string with the
+ /// conversion result.
+ pub fn fallible_assign_latin1_to_utf8<T: Latin1StringLike + ?Sized>(
+ &mut self,
+ other: &T,
+ ) -> Result<(), ()> {
+ self.fallible_append_latin1_to_utf8_check(other, 0)
+ .map(|_| ())
+ }
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-8 and append the conversion result to this string.
+ pub fn append_latin1_to_utf8<T: Latin1StringLike + ?Sized>(&mut self, other: &T) {
+ let len = self.len();
+ self.fallible_append_latin1_to_utf8_check(other, len)
+ .expect("Out of memory");
+ }
+
+ /// Convert a Latin1 (i.e. byte value equals scalar value; not windows-1252!)
+ /// string into UTF-8 and fallibly append the conversion result to this string.
+ pub fn fallible_append_latin1_to_utf8<T: Latin1StringLike + ?Sized>(
+ &mut self,
+ other: &T,
+ ) -> Result<(), ()> {
+ let len = self.len();
+ self.fallible_append_latin1_to_utf8_check(other, len)
+ .map(|_| ())
+ }
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nsstring_fallible_append_utf8_impl(
+ this: *mut nsAString,
+ other: *const u8,
+ other_len: usize,
+ old_len: usize,
+) -> bool {
+ let other_slice = slice::from_raw_parts(other, other_len);
+ (*this)
+ .fallible_append_utf8_impl(other_slice, old_len)
+ .is_ok()
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nsstring_fallible_append_latin1_impl(
+ this: *mut nsAString,
+ other: *const u8,
+ other_len: usize,
+ old_len: usize,
+) -> bool {
+ let other_slice = slice::from_raw_parts(other, other_len);
+ (*this)
+ .fallible_append_latin1_impl(other_slice, old_len)
+ .is_ok()
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nscstring_fallible_append_utf16_to_utf8_impl(
+ this: *mut nsACString,
+ other: *const u16,
+ other_len: usize,
+ old_len: usize,
+) -> bool {
+ let other_slice = slice::from_raw_parts(other, other_len);
+ (*this)
+ .fallible_append_utf16_to_utf8_impl(other_slice, old_len)
+ .is_ok()
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nscstring_fallible_append_utf16_to_latin1_lossy_impl(
+ this: *mut nsACString,
+ other: *const u16,
+ other_len: usize,
+ old_len: usize,
+) -> bool {
+ let other_slice = slice::from_raw_parts(other, other_len);
+ (*this)
+ .fallible_append_utf16_to_latin1_lossy_impl(other_slice, old_len)
+ .is_ok()
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nscstring_fallible_append_utf8_to_latin1_lossy_check(
+ this: *mut nsACString,
+ other: *const nsACString,
+ old_len: usize,
+) -> bool {
+ (*this)
+ .fallible_append_utf8_to_latin1_lossy_check(&*other, old_len)
+ .is_ok()
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn nscstring_fallible_append_latin1_to_utf8_check(
+ this: *mut nsACString,
+ other: *const nsACString,
+ old_len: usize,
+) -> bool {
+ (*this)
+ .fallible_append_latin1_to_utf8_check(&*other, old_len)
+ .is_ok()
+}
--- a/servo/support/gecko/nsstring/src/lib.rs
+++ b/servo/support/gecko/nsstring/src/lib.rs
@@ -119,20 +119,36 @@ extern crate bitflags;
use std::borrow;
use std::cmp;
use std::fmt;
use std::marker::PhantomData;
use std::mem;
use std::ops::{Deref, DerefMut};
use std::os::raw::c_void;
+use std::ptr;
use std::slice;
use std::str;
use std::u32;
+mod conversions;
+
+pub use self::conversions::nscstring_fallible_append_latin1_to_utf8_check;
+pub use self::conversions::nscstring_fallible_append_utf16_to_latin1_lossy_impl;
+pub use self::conversions::nscstring_fallible_append_utf16_to_utf8_impl;
+pub use self::conversions::nscstring_fallible_append_utf8_to_latin1_lossy_check;
+pub use self::conversions::nsstring_fallible_append_latin1_impl;
+pub use self::conversions::nsstring_fallible_append_utf8_impl;
+
+/// Marker type showing that `finish()` was called on a `BulkWriteHandle`;
+/// it is the value `finish()` returns.
+///
+/// Instantiating this type from elsewhere is basically an assertion that
+/// there is no `BulkWriteHandle` around, so be very careful with instantiating
+/// this type!
+pub struct BulkWriteOk;
+
///////////////////////////////////
// Internal Implementation Flags //
///////////////////////////////////
mod data_flags {
bitflags! {
// While this has the same layout as u16, it cannot be passed
// over FFI safely as a u16.
@@ -163,60 +179,203 @@ mod class_flags {
use class_flags::ClassFlags;
use data_flags::DataFlags;
////////////////////////////////////
// Generic String Bindings Macros //
////////////////////////////////////
+// Generates the `$StringLike` trait, whose single method `adapt()` returns a
+// `$StringAdapter`, together with its implementations for references, `Cow`,
+// the `$AString`/`$Str`/`$String` types, and slices, `Vec`s and boxed slices
+// of `$char_t`. It is a separate macro so that it can be invoked both from
+// `define_string_types!` and on its own (see `Latin1StringLike`).
+macro_rules! string_like {
+ {
+ char_t = $char_t: ty;
+
+ AString = $AString: ident;
+ String = $String: ident;
+ Str = $Str: ident;
+
+ StringLike = $StringLike: ident;
+ StringAdapter = $StringAdapter: ident;
+ } => {
+ /// This trait is implemented on types which are `ns[C]String`-like, in
+ /// that they can at very low cost be converted to a borrowed
+ /// `&nsA[C]String`. Unfortunately, the intermediate type
+ /// `ns[C]StringAdapter` is required as well due to types like `&[u8]`
+ /// needing to be (cheaply) wrapped in a `nsCString` on the stack to
+ /// create the `&nsACString`.
+ ///
+ /// This trait is used to DWIM when calling the methods on
+ /// `nsA[C]String`.
+ pub trait $StringLike {
+ fn adapt(&self) -> $StringAdapter;
+ }
+
+ impl<'a, T: $StringLike + ?Sized> $StringLike for &'a T {
+ fn adapt(&self) -> $StringAdapter {
+ <T as $StringLike>::adapt(*self)
+ }
+ }
+
+ impl<'a, T> $StringLike for borrow::Cow<'a, T>
+ where T: $StringLike + borrow::ToOwned + ?Sized {
+ fn adapt(&self) -> $StringAdapter {
+ <T as $StringLike>::adapt(self.as_ref())
+ }
+ }
+
+ impl $StringLike for $AString {
+ fn adapt(&self) -> $StringAdapter {
+ $StringAdapter::Abstract(self)
+ }
+ }
+
+ impl<'a> $StringLike for $Str<'a> {
+ fn adapt(&self) -> $StringAdapter {
+ $StringAdapter::Abstract(self)
+ }
+ }
+
+ impl $StringLike for $String {
+ fn adapt(&self) -> $StringAdapter {
+ $StringAdapter::Abstract(self)
+ }
+ }
+
+ impl $StringLike for [$char_t] {
+ fn adapt(&self) -> $StringAdapter {
+ $StringAdapter::Borrowed($Str::from(self))
+ }
+ }
+
+ impl $StringLike for Vec<$char_t> {
+ fn adapt(&self) -> $StringAdapter {
+ $StringAdapter::Borrowed($Str::from(&self[..]))
+ }
+ }
+
+ impl $StringLike for Box<[$char_t]> {
+ fn adapt(&self) -> $StringAdapter {
+ $StringAdapter::Borrowed($Str::from(&self[..]))
+ }
+ }
+ }
+}
+
+impl<'a> Drop for nsAStringBulkWriteHandle<'a> {
+    /// Failure-path cleanup. A successful `finish()` calls `forget(self)`,
+    /// so this only runs in error cases.
+    fn drop(&mut self) {
+        // A capacity of 0 means the string is a zero-length string, so
+        // there is nothing to repair.
+        if self.capacity != 0 {
+            // The old zero terminator may be gone by now, so a new one has
+            // to be written somewhere and the length made to match. Any
+            // length between 1 and `self.capacity` would work; it seems
+            // prudent to overwrite the uninitialized memory, and length 1
+            // leaves the shortest memory to overwrite. U+FFFD is the safest
+            // placeholder: merely truncating the string to a zero-length
+            // string might be dangerous in some scenarios. See
+            // https://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences
+            // for a closely related scenario.
+            unsafe {
+                let mut repr = self.string.as_repr();
+                let units = repr.as_mut().data.as_ptr();
+                repr.as_mut().length = 1u32;
+                *units = 0xFFFDu16;
+                *units.offset(1isize) = 0;
+            }
+        }
+    }
+}
+
+impl<'a> Drop for nsACStringBulkWriteHandle<'a> {
+    /// Failure-path cleanup. A successful `finish()` calls `forget(self)`,
+    /// so this only runs in error cases.
+    fn drop(&mut self) {
+        // UTF-8 bytes of U+FFFD, the safest placeholder.
+        const REPLACEMENT: [u8; 3] = [0xEFu8, 0xBFu8, 0xBDu8];
+        // ASCII substitute, used when U+FFFD doesn't fit.
+        const SUBSTITUTE: [u8; 1] = [0x1Au8];
+
+        if self.capacity == 0 {
+            // If capacity is 0, the string is a zero-length
+            // string, so we have nothing to do.
+            return;
+        }
+        // The old zero terminator may be gone by now, so a new one has to be
+        // written somewhere and the length made to match. It seems prudent
+        // to overwrite the uninitialized memory with a short placeholder
+        // rather than truncate to a zero-length string, which might be
+        // dangerous in some scenarios. See
+        // https://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences
+        // for a closely related scenario.
+        let placeholder: &[u8] = if self.capacity >= 3 {
+            &REPLACEMENT[..]
+        } else {
+            &SUBSTITUTE[..]
+        };
+        unsafe {
+            let mut repr = self.string.as_repr();
+            let bytes = repr.as_mut().data.as_ptr();
+            repr.as_mut().length = placeholder.len() as u32;
+            for (index, &byte) in placeholder.iter().enumerate() {
+                *bytes.offset(index as isize) = byte;
+            }
+            // Zero terminator right after the placeholder.
+            *bytes.offset(placeholder.len() as isize) = 0;
+        }
+    }
+}
+
macro_rules! define_string_types {
{
char_t = $char_t: ty;
AString = $AString: ident;
String = $String: ident;
Str = $Str: ident;
StringLike = $StringLike: ident;
StringAdapter = $StringAdapter: ident;
StringRepr = $StringRepr: ident;
+ BulkWriteHandle = $BulkWriteHandle: ident;
+
drop = $drop: ident;
assign = $assign: ident, $fallible_assign: ident;
take_from = $take_from: ident, $fallible_take_from: ident;
append = $append: ident, $fallible_append: ident;
set_length = $set_length: ident, $fallible_set_length: ident;
begin_writing = $begin_writing: ident, $fallible_begin_writing: ident;
+ start_bulk_write = $start_bulk_write: ident;
} => {
/// The representation of a ns[C]String type in C++. This type is
/// used internally by our definition of ns[C]String to ensure layout
/// compatibility with the C++ ns[C]String type.
///
/// This type may also be used in place of a C++ ns[C]String inside of
/// struct definitions which are shared with C++, as it has identical
/// layout to our ns[C]String type.
///
/// This struct will leak its data if dropped from rust. See the module
/// documentation for more information on this type.
#[repr(C)]
#[derive(Debug)]
pub struct $StringRepr {
- data: *const $char_t,
+ data: ptr::NonNull<$char_t>,
length: u32,
dataflags: DataFlags,
classflags: ClassFlags,
}
impl $StringRepr {
fn new(classflags: ClassFlags) -> $StringRepr {
static NUL: $char_t = 0;
$StringRepr {
- data: &NUL,
+ // SAFETY: `&NUL` is a reference to a static, so the pointer handed to
+ // `new_unchecked` is never null. The cast to `*mut` is only there to
+ // satisfy `NonNull`'s pointer type.
+ data: unsafe { ptr::NonNull::new_unchecked(&NUL as *const _ as *mut _) },
length: 0,
dataflags: DataFlags::TERMINATED | DataFlags::LITERAL,
classflags: classflags,
}
}
}
impl Deref for $StringRepr {
@@ -231,16 +390,73 @@ macro_rules! define_string_types {
impl DerefMut for $StringRepr {
fn deref_mut(&mut self) -> &mut $AString {
unsafe {
mem::transmute(self)
}
}
}
+ /// Handle for an in-progress bulk write. It holds the exclusive borrow
+ /// of the string being written and the capacity, in code units, that
+ /// `start_bulk_write_impl` reported for its buffer.
+ pub struct $BulkWriteHandle<'a> {
+ string: &'a mut $AString,
+ capacity: usize,
+ }
+
+ impl<'a> $BulkWriteHandle<'a> {
+ /// Wraps `string` and the `capacity` already obtained for it.
+ fn new(string: &'a mut $AString, capacity: usize) -> Self {
+ $BulkWriteHandle{ string: string, capacity: capacity }
+ }
+
+ /// Calls `start_bulk_write_impl` again with the given arguments and
+ /// stores the capacity it returns. On `Err` the function returns
+ /// before the assignment, so the stored capacity is left unchanged.
+ pub unsafe fn restart_bulk_write(&mut self,
+ capacity: usize,
+ units_to_preserve: usize,
+ allow_shrinking: bool) -> Result<(), ()> {
+ self.capacity =
+ self.string.start_bulk_write_impl(capacity,
+ units_to_preserve,
+ allow_shrinking)?;
+ Ok(())
+ }
+
+ /// Completes the bulk write: sets the string length to `length`,
+ /// writes a zero terminator after the last code unit and consumes
+ /// the handle without running its `Drop` failure path.
+ ///
+ /// When `allow_shrinking` is true, `restart_bulk_write(length, length,
+ /// true)` is attempted first; its result is ignored.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `length` is greater than the handle's capacity.
+ pub fn finish(mut self, length: usize, allow_shrinking: bool) -> BulkWriteOk {
+ // NOTE: Drop is implemented outside the macro earlier in this file,
+ // because it needs to deal with different code unit representations
+ // for the REPLACEMENT CHARACTER in the UTF-16 and UTF-8 cases and
+ // needs to deal with a REPLACEMENT CHARACTER not fitting in the
+ // buffer in the UTF-8 case.
+ assert!(length <= self.capacity);
+ if length == 0 {
+ // `truncate()` is OK even when the string
+ // is in invalid state.
+ self.string.truncate();
+ mem::forget(self); // Don't run the failure path in drop()
+ return BulkWriteOk{};
+ }
+ if allow_shrinking {
+ unsafe {
+ // Best effort: a failed shrink is deliberately ignored.
+ let _ = self.restart_bulk_write(length, length, true);
+ }
+ }
+ unsafe {
+ let mut this = self.string.as_repr();
+ this.as_mut().length = length as u32;
+ // Zero terminator goes right after the last written code unit.
+ *(this.as_mut().data.as_ptr().offset(length as isize)) = 0;
+ }
+ mem::forget(self); // Don't run the failure path in drop()
+ BulkWriteOk{}
+ }
+
+ /// Returns the string's buffer as a mutable slice of `capacity`
+ /// code units.
+ pub fn as_mut_slice(&mut self) -> &mut [$char_t] {
+ unsafe {
+ let mut this = self.string.as_repr();
+ slice::from_raw_parts_mut(this.as_mut().data.as_ptr(), self.capacity)
+ }
+ }
+ }
+
/// This type is the abstract type which is used for interacting with
/// strings in rust. Each string type can derefence to an instance of
/// this type, which provides the useful operations on strings.
///
/// NOTE: Rust thinks this type has a size of 0, because the data
/// associated with it is not necessarially safe to move. It is not safe
/// to construct a nsAString yourself, unless it is received by
/// dereferencing one of these types.
@@ -340,65 +556,100 @@ macro_rules! define_string_types {
/// Get a `&mut` reference to the backing data for this string.
/// This method will allocate and copy if the current backing buffer
/// is immutable or shared.
pub fn to_mut(&mut self) -> &mut [$char_t] {
unsafe {
let len = self.len();
if len == 0 {
- // Use an arbitrary non-null value as the pointer
- slice::from_raw_parts_mut(0x1 as *mut $char_t, 0)
+ // Use an arbitrary but aligned non-null value as the pointer
+ slice::from_raw_parts_mut(ptr::NonNull::<$char_t>::dangling().as_ptr(), 0)
} else {
slice::from_raw_parts_mut($begin_writing(self), len)
}
}
}
/// Get a `&mut` reference to the backing data for this string.
/// This method will allocate and copy if the current backing buffer
/// is immutable or shared.
///
/// Returns `Ok(&mut [T])` on success, and `Err(())` if the
/// allocation failed.
pub fn fallible_to_mut(&mut self) -> Result<&mut [$char_t], ()> {
unsafe {
let len = self.len();
if len == 0 {
- // Use an arbitrary non-null value as the pointer
- Ok(slice::from_raw_parts_mut(0x1 as *mut $char_t, 0))
+ // Use an arbitrary but aligned non-null value as the pointer.
+ // `dangling().as_ptr()` already yields `*mut $char_t`, so no
+ // cast is needed (same expression as in `to_mut`).
+ Ok(slice::from_raw_parts_mut(ptr::NonNull::<$char_t>::dangling().as_ptr(), 0))
} else {
let ptr = $fallible_begin_writing(self);
if ptr.is_null() {
Err(())
} else {
Ok(slice::from_raw_parts_mut(ptr, len))
}
}
}
}
+ /// Unshares the string's buffer and hands back a handle. The handle
+ /// gives access to a writable slice whose length is the rounded-up
+ /// capacity.
+ ///
+ /// Also fails if the new length doesn't fit in 32 bits.
+ ///
+ /// # Safety
+ ///
+ /// Unsafe because uninitialized memory is exposed through the handle.
+ pub unsafe fn bulk_write(&mut self,
+                          capacity: usize,
+                          units_to_preserve: usize,
+                          allow_shrinking: bool) -> Result<$BulkWriteHandle, ()> {
+     match self.start_bulk_write_impl(capacity, units_to_preserve, allow_shrinking) {
+         Ok(rounded) => Ok($BulkWriteHandle::new(self, rounded)),
+         Err(()) => Err(()),
+     }
+ }
+
+ /// Shared implementation behind `bulk_write` and `restart_bulk_write`.
+ ///
+ /// Forwards to `$start_bulk_write` and returns the capacity it reports.
+ /// Returns `Err(())` when `capacity` or `units_to_preserve` does not
+ /// fit in a `u32`, or when the call returns `u32::max_value()`.
+ unsafe fn start_bulk_write_impl(&mut self,
+                                 capacity: usize,
+                                 units_to_preserve: usize,
+                                 allow_shrinking: bool) -> Result<usize, ()> {
+     let limit = u32::max_value() as usize;
+     // Both values cross the FFI boundary as `u32`; refuse anything the
+     // casts below would silently truncate.
+     if capacity > limit || units_to_preserve > limit {
+         return Err(());
+     }
+     let rounded = $start_bulk_write(self,
+                                     capacity as u32,
+                                     units_to_preserve as u32,
+                                     allow_shrinking);
+     // A return value of `u32::max_value()` is treated as failure.
+     if rounded == u32::max_value() {
+         return Err(());
+     }
+     Ok(rounded as usize)
+ }
+
+ /// Reinterprets `self` as a pointer to the `$StringRepr` header so that
+ /// its fields (`data`, `length`, ...) can be accessed directly.
+ fn as_repr(&mut self) -> ptr::NonNull<$StringRepr> {
+ // SAFETY: the pointer is derived from `&mut self`, so it is never null.
+ unsafe { ptr::NonNull::new_unchecked(self as *mut _ as *mut $StringRepr)}
+ }
}
impl Deref for $AString {
type Target = [$char_t];
fn deref(&self) -> &[$char_t] {
unsafe {
// All $AString values point to a struct prefix which is
// identical to $StringRepr, this we can transmute `self`
// into $StringRepr to get the reference to the underlying
// data.
let this: &$StringRepr = mem::transmute(self);
- if this.data.is_null() {
- debug_assert_eq!(this.length, 0);
- // Use an arbitrary non-null value as the pointer
- slice::from_raw_parts(0x1 as *const $char_t, 0)
- } else {
- slice::from_raw_parts(this.data, this.length as usize)
- }
+ // `data` is a `ptr::NonNull`, so the former null check is no
+ // longer needed before building the slice.
+ slice::from_raw_parts(this.data.as_ptr(), this.length as usize)
}
}
}
impl AsRef<[$char_t]> for $AString {
fn as_ref(&self) -> &[$char_t] {
self
}
@@ -473,17 +724,17 @@ macro_rules! define_string_types {
impl<'a> From<&'a [$char_t]> for $Str<'a> {
fn from(s: &'a [$char_t]) -> $Str<'a> {
assert!(s.len() < (u32::MAX as usize));
if s.is_empty() {
return $Str::new();
}
$Str {
hdr: $StringRepr {
- data: s.as_ptr(),
+ // SAFETY: the pointer of a slice reference is never null. The
+ // cast to `*mut` is only there to satisfy `NonNull`'s pointer
+ // type.
+ data: unsafe { ptr::NonNull::new_unchecked(s.as_ptr() as *mut _) },
length: s.len() as u32,
dataflags: DataFlags::empty(),
classflags: ClassFlags::empty(),
},
_marker: PhantomData,
}
}
}
@@ -633,24 +884,24 @@ macro_rules! define_string_types {
let length = s.len() as u32;
s.push(0); // null terminator
// SAFETY NOTE: This method produces an data_flags::OWNED
// ns[C]String from a Box<[$char_t]>. this is only safe
// because in the Gecko tree, we use the same allocator for
// Rust code as for C++ code, meaning that our box can be
// legally freed with libc::free().
- let ptr = s.as_ptr();
+ let ptr = s.as_mut_ptr();
mem::forget(s);
unsafe {
Gecko_IncrementStringAdoptCount(ptr as *mut _);
}
$String {
hdr: $StringRepr {
- data: ptr,
+ data: unsafe { ptr::NonNull::new_unchecked(ptr) },
length: length,
dataflags: DataFlags::OWNED | DataFlags::TERMINATED,
classflags: ClassFlags::NULL_TERMINATED,
}
}
}
}
@@ -722,76 +973,35 @@ macro_rules! define_string_types {
fn deref(&self) -> &$AString {
match *self {
$StringAdapter::Borrowed(ref s) => s,
$StringAdapter::Abstract(ref s) => s,
}
}
}
- /// This trait is implemented on types which are `ns[C]String`-like, in
- /// that they can at very low cost be converted to a borrowed
- /// `&nsA[C]String`. Unfortunately, the intermediate type
- /// `ns[C]StringAdapter` is required as well due to types like `&[u8]`
- /// needing to be (cheaply) wrapped in a `nsCString` on the stack to
- /// create the `&nsACString`.
- ///
- /// This trait is used to DWIM when calling the methods on
- /// `nsA[C]String`.
- pub trait $StringLike {
- fn adapt(&self) -> $StringAdapter;
- }
-
- impl<'a, T: $StringLike + ?Sized> $StringLike for &'a T {
- fn adapt(&self) -> $StringAdapter {
- <T as $StringLike>::adapt(*self)
- }
- }
-
- impl<'a, T> $StringLike for borrow::Cow<'a, T>
- where T: $StringLike + borrow::ToOwned + ?Sized {
- fn adapt(&self) -> $StringAdapter {
- <T as $StringLike>::adapt(self.as_ref())
+ impl<'a> $StringAdapter<'a> {
+ /// Returns `true` for the `Abstract` variant and `false` for the
+ /// `Borrowed` variant.
+ #[allow(dead_code)]
+ fn is_abstract(&self) -> bool {
+ match *self {
+ $StringAdapter::Borrowed(_) => false,
+ $StringAdapter::Abstract(_) => true,
+ }
}
}
- impl $StringLike for $AString {
- fn adapt(&self) -> $StringAdapter {
- $StringAdapter::Abstract(self)
- }
- }
-
- impl<'a> $StringLike for $Str<'a> {
- fn adapt(&self) -> $StringAdapter {
- $StringAdapter::Abstract(self)
- }
- }
+ string_like! {
+ char_t = $char_t;
- impl $StringLike for $String {
- fn adapt(&self) -> $StringAdapter {
- $StringAdapter::Abstract(self)
- }
- }
+ AString = $AString;
+ String = $String;
+ Str = $Str;
- impl $StringLike for [$char_t] {
- fn adapt(&self) -> $StringAdapter {
- $StringAdapter::Borrowed($Str::from(self))
- }
- }
-
- impl $StringLike for Vec<$char_t> {
- fn adapt(&self) -> $StringAdapter {
- $StringAdapter::Borrowed($Str::from(&self[..]))
- }
- }
-
- impl $StringLike for Box<[$char_t]> {
- fn adapt(&self) -> $StringAdapter {
- $StringAdapter::Borrowed($Str::from(&self[..]))
- }
+ StringLike = $StringLike;
+ StringAdapter = $StringAdapter;
}
}
}
///////////////////////////////////////////
// Bindings for nsCString (u8 char type) //
///////////////////////////////////////////
@@ -802,49 +1012,28 @@ define_string_types! {
String = nsCString;
Str = nsCStr;
StringLike = nsCStringLike;
StringAdapter = nsCStringAdapter;
StringRepr = nsCStringRepr;
+ BulkWriteHandle = nsACStringBulkWriteHandle;
+
drop = Gecko_FinalizeCString;
assign = Gecko_AssignCString, Gecko_FallibleAssignCString;
take_from = Gecko_TakeFromCString, Gecko_FallibleTakeFromCString;
append = Gecko_AppendCString, Gecko_FallibleAppendCString;
set_length = Gecko_SetLengthCString, Gecko_FallibleSetLengthCString;
begin_writing = Gecko_BeginWritingCString, Gecko_FallibleBeginWritingCString;
+ start_bulk_write = Gecko_StartBulkWriteCString;
}
impl nsACString {
- pub fn assign_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) {
- self.truncate();
- self.append_utf16(other);
- }
-
- pub fn fallible_assign_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
- self.truncate();
- self.fallible_append_utf16(other)
- }
-
- pub fn append_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) {
- unsafe {
- Gecko_AppendUTF16toCString(self, other.adapt().as_ptr());
- }
- }
-
- pub fn fallible_append_utf16<T: nsStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
- if unsafe { Gecko_FallibleAppendUTF16toCString(self, other.adapt().as_ptr()) } {
- Ok(())
- } else {
- Err(())
- }
- }
-
pub unsafe fn as_str_unchecked(&self) -> &str {
str::from_utf8_unchecked(self)
}
}
impl<'a> From<&'a str> for nsCStr<'a> {
fn from(s: &'a str) -> nsCStr<'a> {
s.as_bytes().into()
@@ -920,64 +1109,58 @@ impl nsCStringLike for String {
}
impl nsCStringLike for Box<str> {
fn adapt(&self) -> nsCStringAdapter {
nsCStringAdapter::Borrowed(nsCStr::from(&self[..]))
}
}
+// Generates the `Latin1StringLike` trait. It is implemented on types which
+// are Latin1 `nsCString`-like, in that they can at very low cost be converted
+// to a borrowed `&nsACString` and do not denote UTF-8ness in the Rust type
+// system.
+//
+// This trait is used to DWIM when calling the methods on `nsACString`.
+//
+// Plain comments are used here because rustdoc does not attach doc comments
+// to a macro invocation; the trait's rendered documentation comes from the
+// doc comment inside `string_like!`.
+string_like! {
+ char_t = u8;
+
+ AString = nsACString;
+ String = nsCString;
+ Str = nsCStr;
+
+ StringLike = Latin1StringLike;
+ StringAdapter = nsCStringAdapter;
+}
+
///////////////////////////////////////////
// Bindings for nsString (u16 char type) //
///////////////////////////////////////////
define_string_types! {
char_t = u16;
AString = nsAString;
String = nsString;
Str = nsStr;
StringLike = nsStringLike;
StringAdapter = nsStringAdapter;
StringRepr = nsStringRepr;
+ BulkWriteHandle = nsAStringBulkWriteHandle;
+
drop = Gecko_FinalizeString;
assign = Gecko_AssignString, Gecko_FallibleAssignString;
take_from = Gecko_TakeFromString, Gecko_FallibleTakeFromString;
append = Gecko_AppendString, Gecko_FallibleAppendString;
set_length = Gecko_SetLengthString, Gecko_FallibleSetLengthString;
begin_writing = Gecko_BeginWritingString, Gecko_FallibleBeginWritingString;
-}
-
-impl nsAString {
- pub fn assign_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
- self.truncate();
- self.append_utf8(other);
- }
-
- pub fn fallible_assign_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
- self.truncate();
- self.fallible_append_utf8(other)
- }
-
- pub fn append_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) {
- unsafe {
- Gecko_AppendUTF8toString(self, other.adapt().as_ptr());
- }
- }
-
- pub fn fallible_append_utf8<T: nsCStringLike + ?Sized>(&mut self, other: &T) -> Result<(), ()> {
- if unsafe { Gecko_FallibleAppendUTF8toString(self, other.adapt().as_ptr()) } {
- Ok(())
- } else {
- Err(())
- }
- }
+ start_bulk_write = Gecko_StartBulkWriteString;
}
// NOTE: The From impl for a string slice for nsString produces a <'static>
// lifetime, as it allocates.
impl<'a> From<&'a str> for nsString {
fn from(s: &'a str) -> nsString {
s.encode_utf16().collect::<Vec<u16>>().into()
}
@@ -989,17 +1172,17 @@ impl<'a> From<&'a String> for nsString {
}
}
// Support for the write!() macro for writing to nsStrings
impl fmt::Write for nsAString {
fn write_str(&mut self, s: &str) -> Result<(), fmt::Error> {
// Directly invoke gecko's routines for appending utf8 strings to
// nsAString values, to avoid as much overhead as possible
- self.append_utf8(s);
+ self.append_str(s);
Ok(())
}
}
impl fmt::Display for nsAString {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
fmt::Display::fmt(&String::from_utf16_lossy(&self[..]), f)
}
@@ -1033,52 +1216,58 @@ extern "C" {
fn Gecko_AppendCString(this: *mut nsACString, other: *const nsACString);
fn Gecko_SetLengthCString(this: *mut nsACString, length: u32);
fn Gecko_BeginWritingCString(this: *mut nsACString) -> *mut u8;
fn Gecko_FallibleAssignCString(this: *mut nsACString, other: *const nsACString) -> bool;
fn Gecko_FallibleTakeFromCString(this: *mut nsACString, other: *mut nsACString) -> bool;
fn Gecko_FallibleAppendCString(this: *mut nsACString, other: *const nsACString) -> bool;
fn Gecko_FallibleSetLengthCString(this: *mut nsACString, length: u32) -> bool;
fn Gecko_FallibleBeginWritingCString(this: *mut nsACString) -> *mut u8;
+ // Used as `$start_bulk_write` for `nsACString`. The return value is the
+ // rounded-up capacity; `start_bulk_write_impl` treats `u32::max_value()`
+ // as failure.
+ fn Gecko_StartBulkWriteCString(
+ this: *mut nsACString,
+ capacity: u32,
+ units_to_preserve: u32,
+ allow_shrinking: bool,
+ ) -> u32;
fn Gecko_FinalizeString(this: *mut nsAString);
fn Gecko_AssignString(this: *mut nsAString, other: *const nsAString);
fn Gecko_TakeFromString(this: *mut nsAString, other: *mut nsAString);
fn Gecko_AppendString(this: *mut nsAString, other: *const nsAString);
fn Gecko_SetLengthString(this: *mut nsAString, length: u32);
fn Gecko_BeginWritingString(this: *mut nsAString) -> *mut u16;
fn Gecko_FallibleAssignString(this: *mut nsAString, other: *const nsAString) -> bool;
fn Gecko_FallibleTakeFromString(this: *mut nsAString, other: *mut nsAString) -> bool;
fn Gecko_FallibleAppendString(this: *mut nsAString, other: *const nsAString) -> bool;
fn Gecko_FallibleSetLengthString(this: *mut nsAString, length: u32) -> bool;
fn Gecko_FallibleBeginWritingString(this: *mut nsAString) -> *mut u16;
-
- // Gecko implementation in nsReadableUtils.cpp
- fn Gecko_AppendUTF16toCString(this: *mut nsACString, other: *const nsAString);
- fn Gecko_AppendUTF8toString(this: *mut nsAString, other: *const nsACString);
- fn Gecko_FallibleAppendUTF16toCString(this: *mut nsACString, other: *const nsAString) -> bool;
- fn Gecko_FallibleAppendUTF8toString(this: *mut nsAString, other: *const nsACString) -> bool;
+ // Used as `$start_bulk_write` for `nsAString`. The return value is the
+ // rounded-up capacity; `start_bulk_write_impl` treats `u32::max_value()`
+ // as failure.
+ fn Gecko_StartBulkWriteString(
+ this: *mut nsAString,
+ capacity: u32,
+ units_to_preserve: u32,
+ allow_shrinking: bool,
+ ) -> u32;
}
//////////////////////////////////////
// Repr Validation Helper Functions //
//////////////////////////////////////
pub mod test_helpers {
//! This module only exists to help with ensuring that the layout of the
//! structs inside of rust and C++ are identical.
//!
//! It is public to ensure that these testing functions are avaliable to
//! gtest code.
- use std::mem;
- use super::{ClassFlags, DataFlags};
use super::{nsCStr, nsCString, nsCStringRepr};
use super::{nsStr, nsString, nsStringRepr};
+ use super::{ClassFlags, DataFlags};
+ use std::mem;
/// Generates an #[no_mangle] extern "C" function which returns the size and
/// alignment of the given type with the given name.
macro_rules! size_align_check {
($T:ty, $fname:ident) => {
#[no_mangle]
#[allow(non_snake_case)]
pub extern fn $fname(size: *mut usize, align: *mut usize) {
--- a/toolkit/xre/nsWindowsRestart.cpp
+++ b/toolkit/xre/nsWindowsRestart.cpp
@@ -23,24 +23,25 @@
/**
* Convert UTF8 to UTF16 without using the normal XPCOM goop, which we
* can't link to updater.exe.
*/
static char16_t*
AllocConvertUTF8toUTF16(const char *arg)
{
// UTF16 can't be longer in units than UTF8
- int len = strlen(arg);
+ size_t len = strlen(arg);
char16_t *s = new char16_t[(len + 1) * sizeof(char16_t)];
if (!s)
return nullptr;
- ConvertUTF8toUTF16 convert(s);
- convert.write(arg, len);
- convert.write_terminator();
+ // MultiByteToWideChar returns the number of UTF-16 units written, or 0
+ // on failure; in the failure case the terminator below lands at s[0]
+ // and the caller gets an empty string.
+ size_t dstLen = ::MultiByteToWideChar(
+ CP_UTF8, 0, arg, len, reinterpret_cast<wchar_t*>(s), len);
+ s[dstLen] = 0;
+
return s;
}
static void
FreeAllocStrings(int argc, wchar_t **argv)
{
while (argc) {
--argc;
--- a/toolkit/xre/nsWindowsWMain.cpp
+++ b/toolkit/xre/nsWindowsWMain.cpp
@@ -72,24 +72,27 @@ SanitizeEnvironmentVariables()
delete[] originalPath;
}
}
static char*
AllocConvertUTF16toUTF8(char16ptr_t arg)
{
// be generous... UTF16 units can expand up to 3 UTF8 units
- int len = wcslen(arg);
- char *s = new char[len * 3 + 1];
+ size_t len = wcslen(arg);
+ // ConvertUTF16toUTF8 requires +1. Let's do that here, too, lacking
+ // knowledge of Windows internals.
+ size_t dstLen = len * 3 + 1;
+ char* s = new char[dstLen + 1]; // Another +1 for zero terminator
if (!s)
return nullptr;
- ConvertUTF16toUTF8 convert(s);
- convert.write(arg, len);
- convert.write_terminator();
+ // WideCharToMultiByte returns the number of bytes written, or 0 on
+ // failure; in the failure case the terminator below lands at s[0] and
+ // the caller gets an empty string.
+ int written =
+ ::WideCharToMultiByte(CP_UTF8, 0, arg, len, s, dstLen, nullptr, nullptr);
+ s[written] = 0;
return s;
}
static void
FreeAllocStrings(int argc, char **argv)
{
while (argc) {
--argc;
--- a/xpcom/base/nsAlgorithm.h
+++ b/xpcom/base/nsAlgorithm.h
@@ -54,22 +54,9 @@ NS_COUNT(InputIterator& aFirst, const In
uint32_t result = 0;
for (; aFirst != aLast; ++aFirst)
if (*aFirst == aValue) {
++result;
}
return result;
}
-template <class InputIterator, class OutputIterator>
-inline OutputIterator&
-copy_string(const InputIterator& aFirst, const InputIterator& aLast,
- OutputIterator& aResult)
-{
- typedef nsCharSourceTraits<InputIterator> source_traits;
- typedef nsCharSinkTraits<OutputIterator> sink_traits;
-
- sink_traits::write(aResult, source_traits::read(aFirst),
- source_traits::readable_distance(aFirst, aLast));
- return aResult;
-}
-
#endif // !defined(nsAlgorithm_h___)
--- a/xpcom/ds/nsAtomTable.cpp
+++ b/xpcom/ds/nsAtomTable.cpp
@@ -191,28 +191,25 @@ struct AtomTableKey
: mUTF16String(aUTF16String)
, mUTF8String(nullptr)
, mLength(aLength)
{
mHash = HashString(mUTF16String, mLength);
*aHashOut = mHash;
}
- AtomTableKey(const char* aUTF8String, uint32_t aLength, uint32_t* aHashOut)
+ // UTF-8 key. aErr is passed straight through to HashUTF8AsUTF16; unlike
+ // the previous version, the constructor no longer resets the key when an
+ // error is reported, so the caller has to check *aErr itself.
+ AtomTableKey(const char* aUTF8String,
+ uint32_t aLength,
+ uint32_t* aHashOut,
+ bool* aErr)
: mUTF16String(nullptr)
, mUTF8String(aUTF8String)
, mLength(aLength)
{
- bool err;
- mHash = HashUTF8AsUTF16(mUTF8String, mLength, &err);
- if (err) {
- mUTF8String = nullptr;
- mLength = 0;
- mHash = 0;
- }
+ mHash = HashUTF8AsUTF16(mUTF8String, mLength, aErr);
*aHashOut = mHash;
}
const char16_t* mUTF16String;
const char* mUTF8String;
uint32_t mLength;
uint32_t mHash;
};
@@ -328,20 +325,22 @@ AtomTableGetHash(const void* aKey)
static bool
AtomTableMatchKey(const PLDHashEntryHdr* aEntry, const void* aKey)
{
const AtomTableEntry* he = static_cast<const AtomTableEntry*>(aEntry);
const AtomTableKey* k = static_cast<const AtomTableKey*>(aKey);
if (k->mUTF8String) {
- return
- CompareUTF8toUTF16(nsDependentCSubstring(k->mUTF8String,
- k->mUTF8String + k->mLength),
- nsDependentAtomString(he->mAtom)) == 0;
+ // A comparison that reported an error never counts as a match, even
+ // when the returned value is 0.
+ bool err = false;
+ return (CompareUTF8toUTF16(nsDependentCSubstring(
+ k->mUTF8String, k->mUTF8String + k->mLength),
+ nsDependentAtomString(he->mAtom),
+ &err) == 0) &&
+ !err;
}
return he->mAtom->Equals(k->mUTF16String, k->mLength);
}
void
nsAtomTable::AtomTableClearEntry(PLDHashTable* aTable, PLDHashEntryHdr* aEntry)
{
@@ -682,17 +681,26 @@ NS_Atomize(const char* aUTF8String)
MOZ_ASSERT(gAtomTable);
return gAtomTable->Atomize(nsDependentCString(aUTF8String));
}
already_AddRefed<nsAtom>
nsAtomTable::Atomize(const nsACString& aUTF8String)
{
uint32_t hash;
- AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash);
+ bool err;
+ AtomTableKey key(aUTF8String.Data(), aUTF8String.Length(), &hash, &err);
+ if (MOZ_UNLIKELY(err)) {
+ MOZ_ASSERT_UNREACHABLE("Tried to atomize invalid UTF-8.");
+ // The input was invalid UTF-8. Let's replace the errors with U+FFFD
+ // and atomize the result.
+ nsString str;
+ CopyUTF8toUTF16(aUTF8String, str);
+ return Atomize(str);
+ }
nsAtomSubTable& table = SelectSubTable(key);
MutexAutoLock lock(table.mLock);
AtomTableEntry* he = table.Add(key);
if (he->mAtom) {
RefPtr<nsAtom> atom = he->mAtom;
return atom.forget();
--- a/xpcom/string/moz.build
+++ b/xpcom/string/moz.build
@@ -44,21 +44,9 @@ UNIFIED_SOURCES += [
'nsStringComparator.cpp',
'nsStringObsolete.cpp',
'nsSubstring.cpp',
'nsTextFormatter.cpp',
'nsTSubstringTuple.cpp',
'precompiled_templates.cpp',
]
-# Are we targeting x86 or x86-64? If so, compile the SSE2 functions for
-# nsUTF8Utils.cpp and nsReadableUtils.cpp.
-if CONFIG['INTEL_ARCHITECTURE']:
- SOURCES += ['nsUTF8UtilsSSE2.cpp']
- SOURCES['nsUTF8UtilsSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
- SOURCES += ['nsReadableUtilsSSE2.cpp']
- SOURCES['nsReadableUtilsSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
-
-if CONFIG['HAVE_ARM_NEON'] or CONFIG['CPU_ARCH'] == 'aarch64':
- SOURCES += ['nsUTF8UtilsNEON.cpp']
- SOURCES['nsUTF8UtilsNEON.cpp'].flags += CONFIG['NEON_FLAGS']
-
FINAL_LIBRARY = 'xul'
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -1,793 +1,246 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsReadableUtils.h"
-#include "nsReadableUtilsImpl.h"
#include <algorithm>
#include "mozilla/CheckedInt.h"
#include "nscore.h"
#include "nsMemory.h"
#include "nsString.h"
#include "nsTArray.h"
#include "nsUTF8Utils.h"
-using mozilla::IsASCII;
-
-/**
- * Fallback implementation for finding the first non-ASCII character in a
- * UTF-16 string.
- */
-static inline int32_t
-FirstNonASCIIUnvectorized(const char16_t* aBegin, const char16_t* aEnd)
-{
- typedef mozilla::NonASCIIParameters<sizeof(size_t)> p;
- const size_t kMask = p::mask();
- const uintptr_t kAlignMask = p::alignMask();
- const size_t kNumUnicharsPerWord = p::numUnicharsPerWord();
-
- const char16_t* idx = aBegin;
-
- // Align ourselves to a word boundary.
- for (; idx != aEnd && ((uintptr_t(idx) & kAlignMask) != 0); idx++) {
- if (!IsASCII(*idx)) {
- return idx - aBegin;
- }
- }
-
- // Check one word at a time.
- const char16_t* wordWalkEnd = mozilla::aligned(aEnd, kAlignMask);
- for (; idx != wordWalkEnd; idx += kNumUnicharsPerWord) {
- const size_t word = *reinterpret_cast<const size_t*>(idx);
- if (word & kMask) {
- return idx - aBegin;
- }
- }
-
- // Take care of the remainder one character at a time.
- for (; idx != aEnd; idx++) {
- if (!IsASCII(*idx)) {
- return idx - aBegin;
- }
- }
-
- return -1;
-}
-
-/*
- * This function returns -1 if all characters in str are ASCII characters.
- * Otherwise, it returns a value less than or equal to the index of the first
- * ASCII character in str. For example, if first non-ASCII character is at
- * position 25, it may return 25, 24, or 16. But it guarantees
- * there are only ASCII characters before returned value.
- */
-static inline int32_t
-FirstNonASCII(const char16_t* aBegin, const char16_t* aEnd)
-{
-#ifdef MOZILLA_MAY_SUPPORT_SSE2
- if (mozilla::supports_sse2()) {
- return mozilla::SSE2::FirstNonASCII(aBegin, aEnd);
- }
-#endif
-
- return FirstNonASCIIUnvectorized(aBegin, aEnd);
-}
-
-void
-LossyCopyUTF16toASCII(const nsAString& aSource, nsACString& aDest)
-{
- aDest.Truncate();
- LossyAppendUTF16toASCII(aSource, aDest);
-}
-
-void
-CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest)
-{
- if (!CopyASCIItoUTF16(aSource, aDest, mozilla::fallible)) {
- // Note that this may wildly underestimate the allocation that failed, as
- // we report the length of aSource as UTF-16 instead of UTF-8.
- aDest.AllocFailed(aDest.Length() + aSource.Length());
- }
-}
-
-bool
-CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest,
- const mozilla::fallible_t& aFallible)
-{
- aDest.Truncate();
- return AppendASCIItoUTF16(aSource, aDest, aFallible);
-}
-
-void
-LossyCopyUTF16toASCII(const char16ptr_t aSource, nsACString& aDest)
-{
- aDest.Truncate();
- if (aSource) {
- LossyAppendUTF16toASCII(nsDependentString(aSource), aDest);
- }
-}
-
-void
-CopyASCIItoUTF16(const char* aSource, nsAString& aDest)
-{
- aDest.Truncate();
- if (aSource) {
- AppendASCIItoUTF16(nsDependentCString(aSource), aDest);
- }
-}
-
-void
-CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest)
-{
- if (!CopyUTF16toUTF8(aSource, aDest, mozilla::fallible)) {
- // Note that this may wildly underestimate the allocation that failed, as
- // we report the length of aSource as UTF-16 instead of UTF-8.
- aDest.AllocFailed(aDest.Length() + aSource.Length());
- }
-}
-
-bool
-CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
- const mozilla::fallible_t& aFallible)
-{
- aDest.Truncate();
- if (!AppendUTF16toUTF8(aSource, aDest, aFallible)) {
- return false;
- }
- return true;
-}
-
-void
-CopyUTF8toUTF16(const nsACString& aSource, nsAString& aDest)
-{
- aDest.Truncate();
- AppendUTF8toUTF16(aSource, aDest);
-}
-
-void
-CopyUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest)
-{
- aDest.Truncate();
- AppendUTF16toUTF8(aSource, aDest);
-}
-
-void
-CopyUTF8toUTF16(const char* aSource, nsAString& aDest)
-{
- aDest.Truncate();
- AppendUTF8toUTF16(aSource, aDest);
-}
-
-void
-LossyAppendUTF16toASCII(const nsAString& aSource, nsACString& aDest)
-{
- uint32_t old_dest_length = aDest.Length();
- aDest.SetLength(old_dest_length + aSource.Length());
-
- nsAString::const_iterator fromBegin, fromEnd;
-
- nsACString::iterator dest;
- aDest.BeginWriting(dest);
-
- dest.advance(old_dest_length);
-
- // right now, this won't work on multi-fragment destinations
- LossyConvertEncoding16to8 converter(dest.get());
-
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter);
-}
-
-void
-AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest)
-{
- if (!AppendASCIItoUTF16(aSource, aDest, mozilla::fallible)) {
- aDest.AllocFailed(aDest.Length() + aSource.Length());
- }
-}
-
-bool
-AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest,
- const mozilla::fallible_t& aFallible)
-{
- uint32_t old_dest_length = aDest.Length();
- if (!aDest.SetLength(old_dest_length + aSource.Length(),
- aFallible)) {
- return false;
- }
-
- nsACString::const_iterator fromBegin, fromEnd;
-
- nsAString::iterator dest;
- aDest.BeginWriting(dest);
-
- dest.advance(old_dest_length);
-
- // right now, this won't work on multi-fragment destinations
- LossyConvertEncoding8to16 converter(dest.get());
-
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter);
- return true;
-}
-
-void
-LossyAppendUTF16toASCII(const char16ptr_t aSource, nsACString& aDest)
-{
- if (aSource) {
- LossyAppendUTF16toASCII(nsDependentString(aSource), aDest);
- }
-}
-
-bool
-AppendASCIItoUTF16(const char* aSource, nsAString& aDest, const mozilla::fallible_t& aFallible)
-{
- if (aSource) {
- return AppendASCIItoUTF16(nsDependentCString(aSource), aDest, aFallible);
- }
-
- return true;
-}
-
-void
-AppendASCIItoUTF16(const char* aSource, nsAString& aDest)
-{
- if (aSource) {
- AppendASCIItoUTF16(nsDependentCString(aSource), aDest);
- }
-}
-
-void
-AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest)
-{
- if (!AppendUTF16toUTF8(aSource, aDest, mozilla::fallible)) {
- // Note that this may wildly underestimate the allocation that failed, as
- // we report the length of aSource as UTF-16 instead of UTF-8.
- aDest.AllocFailed(aDest.Length() + aSource.Length());
- }
-}
-
-bool
-AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
- const mozilla::fallible_t& aFallible)
-{
- // At 16 characters analysis showed better performance of both the all ASCII
- // and non-ASCII cases, so we limit calling |FirstNonASCII| to strings of
- // that length.
- const nsAString::size_type kFastPathMinLength = 16;
-
- int32_t firstNonASCII = 0;
- if (aSource.Length() >= kFastPathMinLength) {
- firstNonASCII = FirstNonASCII(aSource.BeginReading(), aSource.EndReading());
- }
-
- if (firstNonASCII == -1) {
- // This is all ASCII, we can use the more efficient lossy append.
- mozilla::CheckedInt<nsACString::size_type> new_length(aSource.Length());
- new_length += aDest.Length();
-
- if (!new_length.isValid() ||
- !aDest.SetCapacity(new_length.value(), aFallible)) {
- return false;
- }
-
- LossyAppendUTF16toASCII(aSource, aDest);
- return true;
- }
-
- nsAString::const_iterator source_start, source_end;
- CalculateUTF8Size calculator;
- aSource.BeginReading(source_start);
- aSource.EndReading(source_end);
-
- // Skip the characters that we know are single byte.
- source_start.advance(firstNonASCII);
-
- copy_string(source_start,
- source_end, calculator);
-
- // Include the ASCII characters that were skipped in the count.
- size_t count = calculator.Size() + firstNonASCII;
-
- if (count) {
- auto old_dest_length = aDest.Length();
- // Grow the buffer if we need to.
- mozilla::CheckedInt<nsACString::size_type> new_length(count);
- new_length += old_dest_length;
-
- if (!new_length.isValid() ||
- !aDest.SetLength(new_length.value(), aFallible)) {
- return false;
- }
-
- // All ready? Time to convert
-
- nsAString::const_iterator ascii_end;
- aSource.BeginReading(ascii_end);
-
- if (firstNonASCII >= static_cast<int32_t>(kFastPathMinLength)) {
- // Use the more efficient lossy converter for the ASCII portion.
- LossyConvertEncoding16to8 lossy_converter(
- aDest.BeginWriting() + old_dest_length);
- nsAString::const_iterator ascii_start;
- aSource.BeginReading(ascii_start);
- ascii_end.advance(firstNonASCII);
-
- copy_string(ascii_start, ascii_end, lossy_converter);
- } else {
- // Not using the lossy shortcut, we need to include the leading ASCII
- // chars.
- firstNonASCII = 0;
- }
-
- ConvertUTF16toUTF8 converter(
- aDest.BeginWriting() + old_dest_length + firstNonASCII);
- copy_string(ascii_end,
- aSource.EndReading(source_end), converter);
-
- NS_ASSERTION(converter.Size() == count - firstNonASCII,
- "Unexpected disparity between CalculateUTF8Size and "
- "ConvertUTF16toUTF8");
- }
-
- return true;
-}
-
-void
-AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest)
-{
- if (!AppendUTF8toUTF16(aSource, aDest, mozilla::fallible)) {
- aDest.AllocFailed(aDest.Length() + aSource.Length());
- }
-}
-
-bool
-AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest,
- const mozilla::fallible_t& aFallible)
-{
- nsACString::const_iterator source_start, source_end;
- CalculateUTF8Length calculator;
- copy_string(aSource.BeginReading(source_start),
- aSource.EndReading(source_end), calculator);
-
- uint32_t count = calculator.Length();
-
- // Avoid making the string mutable if we're appending an empty string
- if (count) {
- uint32_t old_dest_length = aDest.Length();
-
- // Grow the buffer if we need to.
- if (!aDest.SetLength(old_dest_length + count, aFallible)) {
- return false;
- }
-
- // All ready? Time to convert
-
- ConvertUTF8toUTF16 converter(aDest.BeginWriting() + old_dest_length);
- copy_string(aSource.BeginReading(source_start),
- aSource.EndReading(source_end), converter);
-
- NS_ASSERTION(converter.ErrorEncountered() ||
- converter.Length() == count,
- "CalculateUTF8Length produced the wrong length");
-
- if (converter.ErrorEncountered()) {
- NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
- aDest.SetLength(old_dest_length);
- }
- }
-
- return true;
-}
-
-void
-AppendUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest)
-{
- if (aSource) {
- AppendUTF16toUTF8(nsDependentString(aSource), aDest);
- }
-}
-
-void
-AppendUTF8toUTF16(const char* aSource, nsAString& aDest)
-{
- if (aSource) {
- AppendUTF8toUTF16(nsDependentCString(aSource), aDest);
- }
-}
-
+using mozilla::MakeSpan;
/**
* A helper function that allocates a buffer of the desired character type big enough to hold a copy of the supplied string (plus a zero terminator).
*
* @param aSource an string you will eventually be making a copy of
* @return a new buffer (of the type specified by the second parameter) which you must free with |free|.
*
*/
template <class FromStringT, class ToCharT>
inline
ToCharT*
AllocateStringCopy(const FromStringT& aSource, ToCharT*)
{
- return static_cast<ToCharT*>(moz_xmalloc(
- (aSource.Length() + 1) * sizeof(ToCharT)));
+ // Can't overflow due to the definition of nsTSubstring<T>::kMaxCapacity
+ return static_cast<ToCharT*>(
+ moz_xmalloc((size_t(aSource.Length()) + 1) * sizeof(ToCharT)));
}
char*
ToNewCString(const nsAString& aSource)
{
- char* result = AllocateStringCopy(aSource, (char*)0);
- if (!result) {
+ char* dest = AllocateStringCopy(aSource, (char*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsAString::const_iterator fromBegin, fromEnd;
- LossyConvertEncoding16to8 converter(result);
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter).write_terminator();
- return result;
+ auto len = aSource.Length();
+ LossyConvertUTF16toLatin1(aSource, MakeSpan(dest, len));
+ dest[len] = 0;
+ return dest;
}
char*
ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count)
{
- nsAString::const_iterator start, end;
- CalculateUTF8Size calculator;
- copy_string(aSource.BeginReading(start), aSource.EndReading(end),
- calculator);
-
- if (aUTF8Count) {
- *aUTF8Count = calculator.Size();
+ auto len = aSource.Length();
+ // The uses of this function seem temporary enough that it's not
+ // worthwhile to be fancy about the allocation size. Let's just use
+ // the worst case.
+ // Times 3 plus 2, because ConvertUTF16toUTF8 requires times 3 plus 1 and
+ // then we have the terminator.
+ mozilla::CheckedInt<size_t> destLen(len);
+ destLen *= 3;
+ destLen += 2;
+ if (!destLen.isValid()) {
+ return nullptr;
}
-
- char* result = static_cast<char*>
- (moz_xmalloc(calculator.Size() + 1));
- if (!result) {
+ size_t destLenVal = destLen.value();
+ if (destLenVal > UINT32_MAX) {
+ return nullptr;
+ }
+ char* dest = static_cast<char*>(moz_xmalloc(destLenVal));
+ if (!dest) {
return nullptr;
}
- ConvertUTF16toUTF8 converter(result);
- copy_string(aSource.BeginReading(start), aSource.EndReading(end),
- converter).write_terminator();
- NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");
+ size_t written = ConvertUTF16toUTF8(aSource, MakeSpan(dest, destLenVal));
+ dest[written] = 0;
- return result;
+ if (aUTF8Count) {
+ *aUTF8Count = written;
+ }
+
+ return dest;
}
char*
ToNewCString(const nsACString& aSource)
{
// no conversion needed, just allocate a buffer of the correct length and copy into it
- char* result = AllocateStringCopy(aSource, (char*)0);
- if (!result) {
+ char* dest = AllocateStringCopy(aSource, (char*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsACString::const_iterator fromBegin, fromEnd;
- char* toBegin = result;
- *copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- toBegin) = char(0);
- return result;
+ auto len = aSource.Length();
+ memcpy(dest, aSource.BeginReading(), len * sizeof(char));
+ dest[len] = 0;
+ return dest;
}
char16_t*
ToNewUnicode(const nsAString& aSource)
{
// no conversion needed, just allocate a buffer of the correct length and copy into it
- char16_t* result = AllocateStringCopy(aSource, (char16_t*)0);
- if (!result) {
+ char16_t* dest = AllocateStringCopy(aSource, (char16_t*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsAString::const_iterator fromBegin, fromEnd;
- char16_t* toBegin = result;
- *copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- toBegin) = char16_t(0);
- return result;
+ auto len = aSource.Length();
+ memcpy(dest, aSource.BeginReading(), len * sizeof(char16_t));
+ dest[len] = 0;
+ return dest;
}
char16_t*
ToNewUnicode(const nsACString& aSource)
{
- char16_t* result = AllocateStringCopy(aSource, (char16_t*)0);
- if (!result) {
+ char16_t* dest = AllocateStringCopy(aSource, (char16_t*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsACString::const_iterator fromBegin, fromEnd;
- LossyConvertEncoding8to16 converter(result);
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter).write_terminator();
- return result;
-}
-
-uint32_t
-CalcUTF8ToUnicodeLength(const nsACString& aSource)
-{
- nsACString::const_iterator start, end;
- CalculateUTF8Length calculator;
- copy_string(aSource.BeginReading(start), aSource.EndReading(end),
- calculator);
- return calculator.Length();
-}
-
-char16_t*
-UTF8ToUnicodeBuffer(const nsACString& aSource, char16_t* aBuffer,
- uint32_t* aUTF16Count)
-{
- nsACString::const_iterator start, end;
- ConvertUTF8toUTF16 converter(aBuffer);
- copy_string(aSource.BeginReading(start),
- aSource.EndReading(end),
- converter).write_terminator();
- if (aUTF16Count) {
- *aUTF16Count = converter.Length();
- }
- return aBuffer;
+ auto len = aSource.Length();
+ ConvertLatin1toUTF16(aSource, MakeSpan(dest, len));
+ dest[len] = 0;
+ return dest;
}
char16_t*
UTF8ToNewUnicode(const nsACString& aSource, uint32_t* aUTF16Count)
{
- const uint32_t length = CalcUTF8ToUnicodeLength(aSource);
- const size_t buffer_size = (length + 1) * sizeof(char16_t);
- char16_t* buffer = static_cast<char16_t*>(moz_xmalloc(buffer_size));
- if (!buffer) {
+ // Compute length plus one as required by ConvertUTF8toUTF16
+ uint32_t lengthPlusOne = aSource.Length() + 1; // Can't overflow
+
+ mozilla::CheckedInt<size_t> allocLength(lengthPlusOne);
+ // Add space for zero-termination
+ allocLength += 1;
+ // We need UTF-16 units
+ allocLength *= sizeof(char16_t);
+
+ if (!allocLength.isValid()) {
return nullptr;
}
- uint32_t copied;
- UTF8ToUnicodeBuffer(aSource, buffer, &copied);
- NS_ASSERTION(length == copied, "length mismatch");
+ char16_t* dest = (char16_t*)moz_xmalloc(allocLength.value());
+ if (!dest) {
+ return nullptr;
+ }
+
+ size_t written = ConvertUTF8toUTF16(aSource, MakeSpan(dest, lengthPlusOne));
+ dest[written] = 0;
if (aUTF16Count) {
- *aUTF16Count = copied;
+ *aUTF16Count = written;
}
- return buffer;
+
+ return dest;
}
char16_t*
CopyUnicodeTo(const nsAString& aSource, uint32_t aSrcOffset, char16_t* aDest,
uint32_t aLength)
{
- nsAString::const_iterator fromBegin, fromEnd;
- char16_t* toBegin = aDest;
- copy_string(aSource.BeginReading(fromBegin).advance(int32_t(aSrcOffset)),
- aSource.BeginReading(fromEnd).advance(int32_t(aSrcOffset + aLength)),
- toBegin);
+ MOZ_ASSERT(aSrcOffset + aLength <= aSource.Length());
+ memcpy(aDest,
+ aSource.BeginReading() + aSrcOffset,
+ size_t(aLength) * sizeof(char16_t));
return aDest;
}
void
-CopyUnicodeTo(const nsAString::const_iterator& aSrcStart,
- const nsAString::const_iterator& aSrcEnd,
- nsAString& aDest)
-{
- aDest.SetLength(Distance(aSrcStart, aSrcEnd));
-
- nsAString::char_iterator dest = aDest.BeginWriting();
- nsAString::const_iterator fromBegin(aSrcStart);
-
- copy_string(fromBegin, aSrcEnd, dest);
-}
-
-void
-AppendUnicodeTo(const nsAString::const_iterator& aSrcStart,
- const nsAString::const_iterator& aSrcEnd,
- nsAString& aDest)
-{
- uint32_t oldLength = aDest.Length();
- aDest.SetLength(oldLength + Distance(aSrcStart, aSrcEnd));
-
- nsAString::char_iterator dest = aDest.BeginWriting() + oldLength;
- nsAString::const_iterator fromBegin(aSrcStart);
-
- copy_string(fromBegin, aSrcEnd, dest);
-}
-
-bool
-IsASCII(const nsAString& aString)
-{
- static const char16_t NOT_ASCII = char16_t(~0x007F);
-
-
- // Don't want to use |copy_string| for this task, since we can stop at the first non-ASCII character
-
- nsAString::const_iterator iter, done_reading;
- aString.BeginReading(iter);
- aString.EndReading(done_reading);
-
- const char16_t* c = iter.get();
- const char16_t* end = done_reading.get();
-
- while (c < end) {
- if (*c++ & NOT_ASCII) {
- return false;
- }
- }
-
- return true;
-}
-
-/**
- * A character sink for in-place case conversion.
- */
-class ConvertToUpperCase
-{
-public:
- typedef char value_type;
-
- uint32_t
- write(const char* aSource, uint32_t aSourceLength)
- {
- char* cp = const_cast<char*>(aSource);
- const char* end = aSource + aSourceLength;
- while (cp != end) {
- char ch = *cp;
- if (ch >= 'a' && ch <= 'z') {
- *cp = ch - ('a' - 'A');
- }
- ++cp;
- }
- return aSourceLength;
- }
-};
-
-void
ToUpperCase(nsACString& aCString)
{
- ConvertToUpperCase converter;
- char* start;
- converter.write(aCString.BeginWriting(start), aCString.Length());
-}
-
-/**
- * A character sink for copying with case conversion.
- */
-class CopyToUpperCase
-{
-public:
- typedef char value_type;
-
- explicit CopyToUpperCase(nsACString::iterator& aDestIter,
- const nsACString::iterator& aEndIter)
- : mIter(aDestIter)
- , mEnd(aEndIter)
- {
+ char* cp = aCString.BeginWriting();
+ char* end = cp + aCString.Length();
+ while (cp != end) {
+ char ch = *cp;
+ if (ch >= 'a' && ch <= 'z') {
+ *cp = ch - ('a' - 'A');
+ }
+ ++cp;
}
-
- uint32_t
- write(const char* aSource, uint32_t aSourceLength)
- {
- uint32_t len = XPCOM_MIN(uint32_t(mEnd - mIter), aSourceLength);
- char* cp = mIter.get();
- const char* end = aSource + len;
- while (aSource != end) {
- char ch = *aSource;
- if ((ch >= 'a') && (ch <= 'z')) {
- *cp = ch - ('a' - 'A');
- } else {
- *cp = ch;
- }
- ++aSource;
- ++cp;
- }
- mIter.advance(len);
- return len;
- }
-
-protected:
- nsACString::iterator& mIter;
- const nsACString::iterator& mEnd;
-};
+}
void
ToUpperCase(const nsACString& aSource, nsACString& aDest)
{
- nsACString::const_iterator fromBegin, fromEnd;
- nsACString::iterator toBegin, toEnd;
aDest.SetLength(aSource.Length());
-
- CopyToUpperCase converter(aDest.BeginWriting(toBegin), aDest.EndWriting(toEnd));
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter);
+ const char* src = aSource.BeginReading();
+ const char* end = src + aSource.Length();
+ char* dst = aDest.BeginWriting();
+ while (src != end) {
+ char ch = *src;
+ if (ch >= 'a' && ch <= 'z') {
+ *dst = ch - ('a' - 'A');
+ } else {
+ *dst = ch;
+ }
+ ++src;
+ ++dst;
+ }
}
-/**
- * A character sink for case conversion.
- */
-class ConvertToLowerCase
-{
-public:
- typedef char value_type;
-
- uint32_t
- write(const char* aSource, uint32_t aSourceLength)
- {
- char* cp = const_cast<char*>(aSource);
- const char* end = aSource + aSourceLength;
- while (cp != end) {
- char ch = *cp;
- if ((ch >= 'A') && (ch <= 'Z')) {
- *cp = ch + ('a' - 'A');
- }
- ++cp;
- }
- return aSourceLength;
- }
-};
-
void
ToLowerCase(nsACString& aCString)
{
- ConvertToLowerCase converter;
- char* start;
- converter.write(aCString.BeginWriting(start), aCString.Length());
-}
-
-/**
- * A character sink for copying with case conversion.
- */
-class CopyToLowerCase
-{
-public:
- typedef char value_type;
-
- explicit CopyToLowerCase(nsACString::iterator& aDestIter,
- const nsACString::iterator& aEndIter)
- : mIter(aDestIter)
- , mEnd(aEndIter)
- {
+ char* cp = aCString.BeginWriting();
+ char* end = cp + aCString.Length();
+ while (cp != end) {
+ char ch = *cp;
+ if (ch >= 'A' && ch <= 'Z') {
+ *cp = ch + ('a' - 'A');
+ }
+ ++cp;
}
-
- uint32_t
- write(const char* aSource, uint32_t aSourceLength)
- {
- uint32_t len = XPCOM_MIN(uint32_t(mEnd - mIter), aSourceLength);
- char* cp = mIter.get();
- const char* end = aSource + len;
- while (aSource != end) {
- char ch = *aSource;
- if ((ch >= 'A') && (ch <= 'Z')) {
- *cp = ch + ('a' - 'A');
- } else {
- *cp = ch;
- }
- ++aSource;
- ++cp;
- }
- mIter.advance(len);
- return len;
- }
-
-protected:
- nsACString::iterator& mIter;
- const nsACString::iterator& mEnd;
-};
+}
void
ToLowerCase(const nsACString& aSource, nsACString& aDest)
{
- nsACString::const_iterator fromBegin, fromEnd;
- nsACString::iterator toBegin, toEnd;
aDest.SetLength(aSource.Length());
-
- CopyToLowerCase converter(aDest.BeginWriting(toBegin), aDest.EndWriting(toEnd));
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter);
+ const char* src = aSource.BeginReading();
+ const char* end = src + aSource.Length();
+ char* dst = aDest.BeginWriting();
+ while (src != end) {
+ char ch = *src;
+ if (ch >= 'A' && ch <= 'Z') {
+ *dst = ch + ('a' - 'A');
+ } else {
+ *dst = ch;
+ }
+ ++src;
+ ++dst;
+ }
}
bool
ParseString(const nsACString& aSource, char aDelimiter,
nsTArray<nsCString>& aArray)
{
nsACString::const_iterator start, end;
aSource.BeginReading(start);
@@ -1178,117 +631,56 @@ VoidCString()
{
static const nsCString sNull(mozilla::detail::StringDataFlags::VOIDED);
return sNull;
}
int32_t
CompareUTF8toUTF16(const nsACString& aUTF8String,
- const nsAString& aUTF16String)
+ const nsAString& aUTF16String,
+ bool* aErr)
{
- static const uint32_t NOT_ASCII = uint32_t(~0x7F);
-
const char* u8;
const char* u8end;
aUTF8String.BeginReading(u8);
aUTF8String.EndReading(u8end);
const char16_t* u16;
const char16_t* u16end;
aUTF16String.BeginReading(u16);
aUTF16String.EndReading(u16end);
- while (u8 != u8end && u16 != u16end) {
- // Cast away the signedness of *u8 to prevent signextension when
- // converting to uint32_t
- uint32_t c8_32 = (uint8_t)*u8;
-
- if (c8_32 & NOT_ASCII) {
- bool err;
- c8_32 = UTF8CharEnumerator::NextChar(&u8, u8end, &err);
- if (err) {
- return INT32_MIN;
+ for (;;) {
+ if (u8 == u8end) {
+ if (u16 == u16end) {
+ return 0;
}
-
- uint32_t c16_32 = UTF16CharEnumerator::NextChar(&u16, u16end);
- // The above UTF16CharEnumerator::NextChar() calls can
- // fail, but if it does for anything other than no data to
- // look at (which can't happen here), it returns the
- // Unicode replacement character 0xFFFD for the invalid
- // data they were fed. Ignore that error and treat invalid
- // UTF16 as 0xFFFD.
- //
- // This matches what our UTF16 to UTF8 conversion code
- // does, and thus a UTF8 string that came from an invalid
- // UTF16 string will compare equal to the invalid UTF16
- // string it came from. Same is true for any other UTF16
- // string differs only in the invalid part of the string.
-
- if (c8_32 != c16_32) {
- return c8_32 < c16_32 ? -1 : 1;
- }
- } else {
- if (c8_32 != *u16) {
- return c8_32 > *u16 ? 1 : -1;
- }
-
- ++u8;
- ++u16;
+ return -1;
+ }
+ if (u16 == u16end) {
+ return 1;
}
- }
-
- if (u8 != u8end) {
- // We get to the end of the UTF16 string, but no to the end of
- // the UTF8 string. The UTF8 string is longer than the UTF16
- // string
-
+ // No need for ASCII optimization, since both NextChar()
+ // calls get inlined.
+ uint32_t scalar8 = UTF8CharEnumerator::NextChar(&u8, u8end, aErr);
+ uint32_t scalar16 = UTF16CharEnumerator::NextChar(&u16, u16end, aErr);
+ if (scalar16 == scalar8) {
+ continue;
+ }
+ if (scalar8 < scalar16) {
+ return -1;
+ }
return 1;
}
-
- if (u16 != u16end) {
- // We get to the end of the UTF8 string, but no to the end of
- // the UTF16 string. The UTF16 string is longer than the UTF8
- // string
-
- return -1;
- }
-
- // The two strings match.
-
- return 0;
}
void
AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest)
{
NS_ASSERTION(IS_VALID_CHAR(aSource), "Invalid UCS4 char");
if (IS_IN_BMP(aSource)) {
aDest.Append(char16_t(aSource));
} else {
aDest.Append(H_SURROGATE(aSource));
aDest.Append(L_SURROGATE(aSource));
}
}
-
-extern "C" {
-
-void Gecko_AppendUTF16toCString(nsACString* aThis, const nsAString* aOther)
-{
- AppendUTF16toUTF8(*aOther, *aThis);
-}
-
-void Gecko_AppendUTF8toString(nsAString* aThis, const nsACString* aOther)
-{
- AppendUTF8toUTF16(*aOther, *aThis);
-}
-
-bool Gecko_FallibleAppendUTF16toCString(nsACString* aThis, const nsAString* aOther)
-{
- return AppendUTF16toUTF8(*aOther, *aThis, mozilla::fallible);
-}
-
-bool Gecko_FallibleAppendUTF8toString(nsAString* aThis, const nsACString* aOther)
-{
- return AppendUTF8toUTF16(*aOther, *aThis, mozilla::fallible);
-}
-
-}
--- a/xpcom/string/nsReadableUtils.h
+++ b/xpcom/string/nsReadableUtils.h
@@ -15,107 +15,391 @@
#include "mozilla/Assertions.h"
#include "nsAString.h"
#include "nsTArrayForwardDeclare.h"
// Can't include mozilla/Encoding.h here
extern "C" {
- size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
- size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
+ size_t
+ encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
+
+ bool
+ encoding_mem_is_ascii(uint8_t const* buffer, size_t buffer_len);
+
+ bool
+ encoding_mem_is_basic_latin(char16_t const* buffer, size_t buffer_len);
+
+ bool
+ encoding_mem_is_utf8_latin1(uint8_t const* buffer, size_t buffer_len);
+
+ bool
+ encoding_mem_is_str_latin1(uint8_t const* buffer, size_t buffer_len);
+
+ bool
+ encoding_mem_is_utf16_latin1(char16_t const* buffer, size_t buffer_len);
+
+ void
+ encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src,
+ size_t src_len,
+ char* dst,
+ size_t dst_len);
+
+ size_t
+ encoding_mem_convert_utf8_to_latin1_lossy(const char* src,
+ size_t src_len,
+ char* dst,
+ size_t dst_len);
+
+ void
+ encoding_mem_convert_latin1_to_utf16(const char* src,
+ size_t src_len,
+ char16_t* dst,
+ size_t dst_len);
+
+ size_t
+ encoding_mem_convert_utf16_to_utf8(const char16_t* src,
+ size_t src_len,
+ char* dst,
+ size_t dst_len);
+
+ size_t
+ encoding_mem_convert_utf8_to_utf16(const char* src,
+ size_t src_len,
+ char16_t* dst,
+ size_t dst_len);
+}
+
+// From the nsstring crate
+extern "C" {
+ bool
+ nsstring_fallible_append_utf8_impl(nsAString* aThis,
+ const char* aOther,
+ size_t aOtherLen,
+ size_t aOldLen);
+
+ bool
+ nsstring_fallible_append_latin1_impl(nsAString* aThis,
+ const char* aOther,
+ size_t aOtherLen,
+ size_t aOldLen);
+
+ bool
+ nscstring_fallible_append_utf16_to_utf8_impl(nsACString* aThis,
+ const char16_t*,
+ size_t aOtherLen,
+ size_t aOldLen);
+
+ bool
+ nscstring_fallible_append_utf16_to_latin1_lossy_impl(nsACString* aThis,
+ const char16_t*,
+ size_t aOtherLen,
+ size_t aOldLen);
+
+ bool
+ nscstring_fallible_append_utf8_to_latin1_lossy_check(nsACString* aThis,
+ const nsACString* aOther,
+ size_t aOldLen);
+
+ bool
+ nscstring_fallible_append_latin1_to_utf8_check(nsACString* aThis,
+ const nsACString* aOther,
+ size_t aOldLen);
+}
+
+/**
+ * If all the code points in the input are below U+0100, converts to Latin1,
+ * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If
+ * there are code points above U+00FF, asserts in debug builds and produces
+ * garbage in release builds. The nature of the garbage depends on the CPU
+ * architecture and must not be relied upon.
+ *
+ * The length of aDest must not be less than the length of aSource.
+ */
+inline void
+LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource,
+ mozilla::Span<char> aDest)
+{
+ encoding_mem_convert_utf16_to_latin1_lossy(
+ aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
+ * If all the code points in the input are below U+0100, converts to Latin1,
+ * i.e. unsigned byte value is Unicode scalar value; not windows-1252. If
+ * there are code points above U+00FF, asserts in debug builds and produces
+ * garbage in release builds. The nature of the garbage may depend on the CPU
+ * architecture and must not be relied upon.
+ *
+ * The length of aDest must not be less than the length of aSource.
+ */
+inline size_t
+LossyConvertUTF8toLatin1(mozilla::Span<const char> aSource,
+ mozilla::Span<char> aDest)
+{
+ return encoding_mem_convert_utf8_to_latin1_lossy(
+ aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
+ * Interprets unsigned byte value as Unicode scalar value (i.e. not
+ * windows-1252!).
+ *
+ * The length of aDest must not be less than the length of aSource.
+ */
+inline void
+ConvertLatin1toUTF16(mozilla::Span<const char> aSource,
+ mozilla::Span<char16_t> aDest)
+{
+ encoding_mem_convert_latin1_to_utf16(
+ aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
+ * Lone surrogates are replaced with the REPLACEMENT CHARACTER.
+ *
+ * The length of aDest must be at least the length of aSource times three
+ * _plus one_.
+ *
+ * Returns the number of code units written.
+ */
+inline size_t
+ConvertUTF16toUTF8(mozilla::Span<const char16_t> aSource,
+ mozilla::Span<char> aDest)
+{
+ return encoding_mem_convert_utf16_to_utf8(
+ aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
+ * Malformed byte sequences are replaced with the REPLACEMENT CHARACTER.
+ *
+ * The length of aDest must be at least one greater than the length of aSource.
+ *
+ * Returns the number of code units written.
+ */
+inline size_t
+ConvertUTF8toUTF16(mozilla::Span<const char> aSource,
+ mozilla::Span<char16_t> aDest)
+{
+ return encoding_mem_convert_utf8_to_utf16(
+ aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
}
inline size_t
Distance(const nsReadingIterator<char16_t>& aStart,
const nsReadingIterator<char16_t>& aEnd)
{
MOZ_ASSERT(aStart.get() <= aEnd.get());
return static_cast<size_t>(aEnd.get() - aStart.get());
}
+
inline size_t
Distance(const nsReadingIterator<char>& aStart,
const nsReadingIterator<char>& aEnd)
{
MOZ_ASSERT(aStart.get() <= aEnd.get());
return static_cast<size_t>(aEnd.get() - aStart.get());
}
-void LossyCopyUTF16toASCII(const nsAString& aSource, nsACString& aDest);
-void CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
-MOZ_MUST_USE bool CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest,
- const mozilla::fallible_t&);
+// UTF-8 to UTF-16
+// Invalid UTF-8 byte sequences are replaced with the REPLACEMENT CHARACTER.
+
+inline MOZ_MUST_USE bool
+CopyUTF8toUTF16(mozilla::Span<const char> aSource,
+ nsAString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nsstring_fallible_append_utf8_impl(
+ &aDest, aSource.Elements(), aSource.Length(), 0);
+}
-void LossyCopyUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
-void CopyASCIItoUTF16(const char* aSource, nsAString& aDest);
+inline void
+CopyUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
+{
+ if (MOZ_UNLIKELY(!CopyUTF8toUTF16(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aSource.Length());
+ }
+}
+
+inline MOZ_MUST_USE bool
+AppendUTF8toUTF16(mozilla::Span<const char> aSource,
+ nsAString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nsstring_fallible_append_utf8_impl(
+ &aDest, aSource.Elements(), aSource.Length(), aDest.Length());
+}
+
+inline void
+AppendUTF8toUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
+{
+ if (MOZ_UNLIKELY(!AppendUTF8toUTF16(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aDest.Length() + aSource.Length());
+ }
+}
-void CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
-MOZ_MUST_USE bool CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
- const mozilla::fallible_t&);
-void CopyUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
+// Latin1 to UTF-16
+// Interpret each incoming unsigned byte value as a Unicode scalar value (not
+// windows-1252!). The function names say "ASCII" instead of "Latin1" for
+// legacy reasons.
+
+inline MOZ_MUST_USE bool
+CopyASCIItoUTF16(mozilla::Span<const char> aSource,
+ nsAString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nsstring_fallible_append_latin1_impl(
+ &aDest, aSource.Elements(), aSource.Length(), 0);
+}
-void CopyUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
-void CopyUTF8toUTF16(const char* aSource, nsAString& aDest);
+inline void
+CopyASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
+{
+ if (MOZ_UNLIKELY(!CopyASCIItoUTF16(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aSource.Length());
+ }
+}
+
+inline MOZ_MUST_USE bool
+AppendASCIItoUTF16(mozilla::Span<const char> aSource,
+ nsAString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nsstring_fallible_append_latin1_impl(
+ &aDest, aSource.Elements(), aSource.Length(), aDest.Length());
+}
+
+inline void
+AppendASCIItoUTF16(mozilla::Span<const char> aSource, nsAString& aDest)
+{
+ if (MOZ_UNLIKELY(!AppendASCIItoUTF16(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aDest.Length() + aSource.Length());
+ }
+}
-void LossyAppendUTF16toASCII(const nsAString& aSource, nsACString& aDest);
-void AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
-MOZ_MUST_USE bool AppendASCIItoUTF16(const nsACString& aSource,
- nsAString& aDest,
- const mozilla::fallible_t&);
+// UTF-16 to UTF-8
+// Unpaired surrogates are replaced with the REPLACEMENT CHARACTER.
+
+inline MOZ_MUST_USE bool
+CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource,
+ nsACString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nscstring_fallible_append_utf16_to_utf8_impl(
+ &aDest, aSource.Elements(), aSource.Length(), 0);
+}
-void LossyAppendUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
-MOZ_MUST_USE bool AppendASCIItoUTF16(const char* aSource,
- nsAString& aDest,
- const mozilla::fallible_t&);
-void AppendASCIItoUTF16(const char* aSource, nsAString& aDest);
+inline void
+CopyUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest)
+{
+ if (MOZ_UNLIKELY(!CopyUTF16toUTF8(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aSource.Length());
+ }
+}
+
+inline MOZ_MUST_USE bool
+AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource,
+ nsACString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nscstring_fallible_append_utf16_to_utf8_impl(
+ &aDest, aSource.Elements(), aSource.Length(), aDest.Length());
+}
+
+inline void
+AppendUTF16toUTF8(mozilla::Span<const char16_t> aSource, nsACString& aDest)
+{
+ if (MOZ_UNLIKELY(!AppendUTF16toUTF8(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aDest.Length() + aSource.Length());
+ }
+}
-void AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
-MOZ_MUST_USE bool AppendUTF16toUTF8(const nsAString& aSource,
- nsACString& aDest,
- const mozilla::fallible_t&);
-void AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
-MOZ_MUST_USE bool AppendUTF8toUTF16(const nsACString& aSource,
- nsAString& aDest,
- const mozilla::fallible_t&);
+// UTF-16 to Latin1
+// If all code points in the input are below U+0100, represents each scalar
+// value as an unsigned byte. (This is not windows-1252!) If there are code
+// points above U+00FF, asserts in debug builds and memory-safely produces
+// garbage in release builds. The nature of the garbage may differ based on
+// CPU architecture and must not be relied upon. The names say "ASCII" instead
+// of "Latin1" for legacy reasons.
+
+inline MOZ_MUST_USE bool
+LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource,
+ nsACString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nscstring_fallible_append_utf16_to_latin1_lossy_impl(
+ &aDest, aSource.Elements(), aSource.Length(), 0);
+}
-void AppendUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
-void AppendUTF8toUTF16(const char* aSource, nsAString& aDest);
+inline void
+LossyCopyUTF16toASCII(mozilla::Span<const char16_t> aSource, nsACString& aDest)
+{
+ if (MOZ_UNLIKELY(!LossyCopyUTF16toASCII(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aSource.Length());
+ }
+}
+
+inline MOZ_MUST_USE bool
+LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource,
+ nsACString& aDest,
+ const mozilla::fallible_t&)
+{
+ return nscstring_fallible_append_utf16_to_latin1_lossy_impl(
+ &aDest, aSource.Elements(), aSource.Length(), aDest.Length());
+}
+
+inline void
+LossyAppendUTF16toASCII(mozilla::Span<const char16_t> aSource,
+ nsACString& aDest)
+{
+ if (MOZ_UNLIKELY(
+ !LossyAppendUTF16toASCII(aSource, aDest, mozilla::fallible))) {
+ aDest.AllocFailed(aDest.Length() + aSource.Length());
+ }
+}
/**
* Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with |free|.
- * Performs a lossy encoding conversion by chopping 16-bit wide characters down to 8-bits wide while copying |aSource| to your new buffer.
- * This conversion is not well defined; but it reproduces legacy string behavior.
- * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
+ * Performs a conversion with LossyConvertUTF16toLatin1() writing into the
+ * newly-allocated buffer.
+ *
+ * The new buffer is zero-terminated, but that may not help you if |aSource|
+ * contains embedded nulls.
*
* @param aSource a 16-bit wide string
* @return a new |char| buffer you must free with |free|.
*/
char* ToNewCString(const nsAString& aSource);
-
/**
* Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with |free|.
- * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
+ *
+ * The new buffer is zero-terminated, but that may not help you if |aSource|
+ * contains embedded nulls.
*
* @param aSource an 8-bit wide string
* @return a new |char| buffer you must free with |free|.
*/
char* ToNewCString(const nsACString& aSource);
/**
* Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with
* |free|.
- * Performs an encoding conversion from a UTF-16 string to a UTF-8 string
- * copying |aSource| to your new buffer.
+ * Performs an encoding conversion from a UTF-16 string to a UTF-8 string
+ * while copying |aSource| to your new buffer. Unpaired surrogates are
+ * replaced with the REPLACEMENT CHARACTER.
+ *
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
* @param aSource a UTF-16 string (made of char16_t's)
* @param aUTF8Count the number of 8-bit units that was returned
* @return a new |char| buffer you must free with |free|.
*/
@@ -123,79 +407,56 @@ char* ToNewUTF8String(const nsAString& a
/**
* Returns a new |char16_t| buffer containing a zero-terminated copy of
* |aSource|.
*
* Allocates and returns a new |char16_t| buffer which you must free with
* |free|.
+ *
* The new buffer is zero-terminated, but that may not help you if |aSource|
* contains embedded nulls.
*
* @param aSource a UTF-16 string
* @return a new |char16_t| buffer you must free with |free|.
*/
char16_t* ToNewUnicode(const nsAString& aSource);
/**
- * Returns a new |char16_t| buffer containing a zero-terminated copy of |aSource|.
+ * Returns a new |char16_t| buffer containing a zero-terminated copy of
+ * |aSource|.
+ *
+ * Allocates and returns a new |char16_t| buffer which you must free with
+ * |free|.
*
- * Allocates and returns a new |char16_t| buffer which you must free with |free|.
- * Performs an encoding conversion by 0-padding 8-bit wide characters up to 16-bits wide while copying |aSource| to your new buffer.
- * This conversion is not well defined; but it reproduces legacy string behavior.
- * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
+ * Performs an encoding conversion by 0-padding 8-bit wide characters up to
+ * 16-bits wide (i.e. Latin1 to UTF-16 conversion) while copying |aSource|
+ * to your new buffer.
*
- * @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
+ * The new buffer is zero-terminated, but that may not help you if |aSource|
+ * contains embedded nulls.
+ *
+ * @param aSource a Latin1 string
* @return a new |char16_t| buffer you must free with |free|.
*/
char16_t* ToNewUnicode(const nsACString& aSource);
/**
- * Returns the required length for a char16_t buffer holding
- * a copy of aSource, using UTF-8 to UTF-16 conversion.
- * The length does NOT include any space for zero-termination.
- *
- * @param aSource an 8-bit wide string, UTF-8 encoded
- * @return length of UTF-16 encoded string copy, not zero-terminated
- */
-uint32_t CalcUTF8ToUnicodeLength(const nsACString& aSource);
-
-/**
- * Copies the source string into the specified buffer, converting UTF-8 to
- * UTF-16 in the process. The conversion is well defined for valid UTF-8
- * strings.
- * The copied string will be zero-terminated! Any embedded nulls will be
- * copied nonetheless. It is the caller's responsiblity to ensure the buffer
- * is large enough to hold the string copy plus one char16_t for
- * zero-termination!
- *
- * @see CalcUTF8ToUnicodeLength( const nsACString& )
- * @see UTF8ToNewUnicode( const nsACString&, uint32_t* )
- *
- * @param aSource an 8-bit wide string, UTF-8 encoded
- * @param aBuffer the buffer holding the converted string copy
- * @param aUTF16Count receiving optionally the number of 16-bit units that
- * were copied
- * @return aBuffer pointer, for convenience
- */
-char16_t* UTF8ToUnicodeBuffer(const nsACString& aSource,
- char16_t* aBuffer,
- uint32_t* aUTF16Count = nullptr);
-
-/**
* Returns a new |char16_t| buffer containing a zero-terminated copy
* of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with
* |free|. Performs an encoding conversion from UTF-8 to UTF-16
- * while copying |aSource| to your new buffer. This conversion is well defined
- * for a valid UTF-8 string. The new buffer is zero-terminated, but that
- * may not help you if |aSource| contains embedded nulls.
+ * while copying |aSource| to your new buffer. Malformed byte sequences
+ * are replaced with the REPLACEMENT CHARACTER.
+ *
+ * The new buffer is zero-terminated, but that may not help you if |aSource|
+ * contains embedded nulls.
*
* @param aSource an 8-bit wide string, UTF-8 encoded
* @param aUTF16Count the number of 16-bit units that was returned
* @return a new |char16_t| buffer you must free with |free|.
* (UTF-16 encoded)
*/
char16_t* UTF8ToNewUnicode(const nsACString& aSource,
uint32_t* aUTF16Count = nullptr);
@@ -212,99 +473,163 @@ char16_t* UTF8ToNewUnicode(const nsACStr
* @param aLength the number of 16-bit code units to copy
* @return pointer to destination buffer - identical to |aDest|
*/
char16_t* CopyUnicodeTo(const nsAString& aSource,
uint32_t aSrcOffset,
char16_t* aDest,
uint32_t aLength);
-
/**
- * Copies 16-bit characters between iterators |aSrcStart| and
- * |aSrcEnd| to the writable string |aDest|. Similar to the
- * |nsString::Mid| method.
- *
- * After this operation |aDest| is not null terminated.
- *
- * @param aSrcStart start source iterator
- * @param aSrcEnd end source iterator
- * @param aDest destination for the copy
- */
-void CopyUnicodeTo(const nsAString::const_iterator& aSrcStart,
- const nsAString::const_iterator& aSrcEnd,
- nsAString& aDest);
-
-/**
- * Appends 16-bit characters between iterators |aSrcStart| and
- * |aSrcEnd| to the writable string |aDest|.
- *
- * After this operation |aDest| is not null terminated.
- *
- * @param aSrcStart start source iterator
- * @param aSrcEnd end source iterator
- * @param aDest destination for the copy
- */
-void AppendUnicodeTo(const nsAString::const_iterator& aSrcStart,
- const nsAString::const_iterator& aSrcEnd,
- nsAString& aDest);
-
-/**
- * Returns |true| if |aString| contains only ASCII characters, that is, characters in the range (0x00, 0x7F).
+ * Returns |true| if |aString| contains only ASCII characters, that is,
+ * characters in the range [0x00, 0x7F].
*
* @param aString a 16-bit wide string to scan
*/
-bool IsASCII(const nsAString& aString);
+inline bool
+IsASCII(mozilla::Span<const char16_t> aString)
+{
+ size_t length = aString.Length();
+ const char16_t* ptr = aString.Elements();
+ // For short strings, calling into Rust is a pessimization, and the SIMD
+ // code won't have a chance to kick in anyway.
+ if (length < 16) {
+ char16_t accu = 0;
+ for (size_t i = 0; i < length; i++) {
+ accu |= ptr[i];
+ }
+ return accu < 0x80U;
+ }
+ return encoding_mem_is_basic_latin(ptr, length);
+}
/**
- * Returns |true| if |aString| contains only ASCII characters, that is, characters in the range (0x00, 0x7F).
+ * Returns |true| if |aString| contains only ASCII characters, that is,
+ * characters in the range [0x00, 0x7F].
*
* @param aString a 8-bit wide string to scan
*/
-inline bool IsASCII(const nsACString& aString)
+inline bool
+IsASCII(mozilla::Span<const char> aString)
{
size_t length = aString.Length();
- const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
+ const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
// For short strings, calling into Rust is a pessimization, and the SIMD
- // code won't have a chance to kick in anyway. Additionally, handling the
- // case of the empty string here makes null-checking ptr unnecessary.
- // (Passing nullptr to Rust would technically be UB.)
+ // code won't have a chance to kick in anyway.
if (length < 16) {
- size_t accu = 0;
+ uint8_t accu = 0;
+ for (size_t i = 0; i < length; i++) {
+ accu |= ptr[i];
+ }
+ return accu < 0x80U;
+ }
+ return encoding_mem_is_ascii(ptr, length);
+}
+
+/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is,
+ * characters in the range [U+0000, U+00FF].
+ *
+ * @param aString a potentially-invalid UTF-16 string to scan
+ */
+inline bool
+IsUTF16Latin1(mozilla::Span<const char16_t> aString)
+{
+ size_t length = aString.Length();
+ const char16_t* ptr = aString.Elements();
+ // For short strings, calling into Rust is a pessimization, and the SIMD
+ // code won't have a chance to kick in anyway.
+ if (length < 16) {
+ char16_t accu = 0;
for (size_t i = 0; i < length; i++) {
accu |= ptr[i];
}
- return accu < 0x80;
+ return accu < 0x100U;
+ }
+ return encoding_mem_is_utf16_latin1(ptr, length);
+}
+
+/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is,
+ * characters in the range [U+0000, U+00FF].
+ *
+ * If you know that the argument is always absolutely guaranteed to be valid
+ * UTF-8, use the faster UnsafeIsValidUTF8Latin1() instead.
+ *
+ * @param aString potentially-invalid UTF-8 string to scan
+ */
+inline bool
+IsUTF8Latin1(mozilla::Span<const char> aString)
+{
+ size_t length = aString.Length();
+ const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
+ // For short strings, calling into Rust is a pessimization, and the SIMD
+ // code won't have a chance to kick in anyway.
+ if (length < 16) {
+ for (size_t i = 0; i < length; i++) {
+ if (ptr[i] >= 0x80U) {
+ ptr += i;
+ length -= i;
+ goto end;
+ }
+ }
+ return true;
}
- // This is not quite optimal, because it's not fail-fast when the by-register
- // check already finds non-ASCII. Also, input to this function is almost
- // always ASCII, so even the by-register check wouldn't need to be fail-fast
- // and could be more like the loop above.
- return length == encoding_ascii_valid_up_to(ptr, length);
+end:
+ return encoding_mem_is_utf8_latin1(ptr, length);
+}
+
+/**
+ * Returns |true| if |aString| contains only Latin1 characters, that is,
+ * characters in the range [U+0000, U+00FF].
+ *
+ * The argument MUST be valid UTF-8. If you are at all unsure, use IsUTF8Latin1
+ * instead!
+ *
+ * @param aString known-valid UTF-8 string to scan
+ */
+inline bool
+UnsafeIsValidUTF8Latin1(mozilla::Span<const char> aString)
+{
+ size_t length = aString.Length();
+ const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
+ // For short strings, calling into Rust is a pessimization, and the SIMD
+ // code won't have a chance to kick in anyway.
+ if (length < 16) {
+ for (size_t i = 0; i < length; i++) {
+ if (ptr[i] >= 0x80U) {
+ ptr += i;
+ length -= i;
+ goto end;
+ }
+ }
+ return true;
+ }
+end:
+ return encoding_mem_is_str_latin1(ptr, length);
}
/**
* Returns |true| if |aString| is a valid UTF-8 string.
*
* Note that this doesn't check whether the string might look like a valid
* string in another encoding, too, e.g. ISO-2022-JP.
*
* @param aString an 8-bit wide string to scan
*/
-inline bool IsUTF8(const nsACString& aString)
+inline bool
+IsUTF8(mozilla::Span<const char> aString)
{
size_t length = aString.Length();
- const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
+ const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
// For short strings, calling into Rust is a pessimization, and the SIMD
- // code won't have a chance to kick in anyway. Additionally, handling the
- // case of the empty string here makes null-checking ptr unnecessary.
- // (Passing nullptr to Rust would technically be UB.)
+ // code won't have a chance to kick in anyway.
if (length < 16) {
for (size_t i = 0; i < length; i++) {
- if (ptr[i] >= 0x80) {
+ if (ptr[i] >= 0x80U) {
ptr += i;
length -= i;
goto end;
}
}
return true;
}
end:
@@ -328,22 +653,26 @@ void ToLowerCase(nsACString&);
/**
* Converts case from string aSource to aDest.
*/
void ToUpperCase(const nsACString& aSource, nsACString& aDest);
void ToLowerCase(const nsACString& aSource, nsACString& aDest);
/**
- * Finds the leftmost occurrence of |aPattern|, if any in the range |aSearchStart|..|aSearchEnd|.
+ * Finds the leftmost occurrence of |aPattern|, if any in the range
+ * |aSearchStart|..|aSearchEnd|.
*
- * Returns |true| if a match was found, and adjusts |aSearchStart| and |aSearchEnd| to
- * point to the match. If no match was found, returns |false| and makes |aSearchStart == aSearchEnd|.
+ * Returns |true| if a match was found, and adjusts |aSearchStart| and
+ * |aSearchEnd| to point to the match. If no match was found, returns |false|
+ * and makes |aSearchStart == aSearchEnd|.
*
- * Currently, this is equivalent to the O(m*n) implementation previously on |ns[C]String|.
+ * Currently, this is equivalent to the O(m*n) implementation previously on
+ * |ns[C]String|.
+ *
* If we need something faster, then we can implement that later.
*/
bool FindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
nsAString::const_iterator&,
const nsStringComparator& = nsDefaultStringComparator());
bool FindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
nsACString::const_iterator&,
@@ -373,19 +702,19 @@ FindInReadable(const nsACString& aPatter
bool CaseInsensitiveFindInReadable(const nsACString& aPattern,
nsACString::const_iterator&,
nsACString::const_iterator&);
/**
* Finds the rightmost occurrence of |aPattern|
- * Returns |true| if a match was found, and adjusts |aSearchStart| and |aSearchEnd| to
- * point to the match. If no match was found, returns |false| and makes |aSearchStart == aSearchEnd|.
- *
+ * Returns |true| if a match was found, and adjusts |aSearchStart| and
+ * |aSearchEnd| to point to the match. If no match was found, returns |false|
+ * and makes |aSearchStart == aSearchEnd|.
*/
bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
nsAString::const_iterator&,
const nsStringComparator& = nsDefaultStringComparator());
bool RFindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
nsACString::const_iterator&,
const nsCStringComparator& = nsDefaultCStringComparator());
@@ -425,25 +754,28 @@ bool StringEndsWith(const nsACString& aS
const nsString& EmptyString();
const nsCString& EmptyCString();
const nsString& VoidString();
const nsCString& VoidCString();
/**
-* Compare a UTF-8 string to an UTF-16 string.
-*
-* Returns 0 if the strings are equal, -1 if aUTF8String is less
-* than aUTF16Count, and 1 in the reverse case. In case of fatal
-* error (eg the strings are not valid UTF8 and UTF16 respectively),
-* this method will return INT32_MIN.
-*/
-int32_t CompareUTF8toUTF16(const nsACString& aUTF8String,
- const nsAString& aUTF16String);
+ * Compare a UTF-8 string to an UTF-16 string.
+ *
+ * Returns 0 if the strings are equal, -1 if aUTF8String is less
+ * than aUTF16String, and 1 in the reverse case. Errors are replaced
+ * with U+FFFD and then the U+FFFD is compared as if it had occurred
+ * in the input. If aErr is not nullptr, *aErr is set to true if
+ * either string had malformed sequences.
+ */
+int32_t
+CompareUTF8toUTF16(const nsACString& aUTF8String,
+ const nsAString& aUTF16String,
+ bool* aErr = nullptr);
void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest);
template<class T>
inline bool
EnsureStringLength(T& aStr, uint32_t aLen)
{
aStr.SetLength(aLen);
--- a/xpcom/string/nsSubstring.cpp
+++ b/xpcom/string/nsSubstring.cpp
@@ -454,16 +454,24 @@ char* Gecko_BeginWritingCString(nsACStri
return aThis->BeginWriting();
}
char* Gecko_FallibleBeginWritingCString(nsACString* aThis)
{
return aThis->BeginWriting(mozilla::fallible);
}
+uint32_t
+Gecko_StartBulkWriteCString(nsACString* aThis,
+ uint32_t aCapacity,
+ uint32_t aUnitsToPreserve)
+{
+ return aThis->StartBulkWrite(aCapacity, aUnitsToPreserve);
+}
+
void Gecko_FinalizeString(nsAString* aThis)
{
aThis->~nsAString();
}
void Gecko_AssignString(nsAString* aThis, const nsAString* aOther)
{
aThis->Assign(*aOther);
@@ -509,9 +517,17 @@ char16_t* Gecko_BeginWritingString(nsASt
return aThis->BeginWriting();
}
char16_t* Gecko_FallibleBeginWritingString(nsAString* aThis)
{
return aThis->BeginWriting(mozilla::fallible);
}
+uint32_t
+Gecko_StartBulkWriteString(nsAString* aThis,
+ uint32_t aCapacity,
+ uint32_t aUnitsToPreserve)
+{
+ return aThis->StartBulkWrite(aCapacity, aUnitsToPreserve);
+}
+
} // extern "C"
--- a/xpcom/string/nsTStringObsolete.cpp
+++ b/xpcom/string/nsTStringObsolete.cpp
@@ -320,24 +320,21 @@ nsTString<T>::ReplaceSubstring(const sel
"We should have the correct non-matching segment.");
return true;
}
// Make sure that we can mutate our buffer.
// Note that we always allocate at least an this->mLength sized buffer, because the
// rest of the algorithm relies on having access to all of the original
// string. In other words, we over-allocate in the shrinking case.
- char_type* oldData;
- DataFlags oldFlags;
- if (!this->MutatePrep(XPCOM_MAX(this->mLength, newLength.value()), &oldData, &oldFlags))
+ uint32_t oldLen = this->mLength;
+ uint32_t capacity =
+ this->StartBulkWrite(XPCOM_MAX(oldLen, newLength.value()), oldLen);
+ if (capacity == UINT32_MAX) {
return false;
- if (oldData) {
- // Copy all of the old data to the new buffer.
- char_traits::copy(this->mData, oldData, this->mLength);
- ::ReleaseData(oldData, oldFlags);
}
if (aTarget.Length() >= aNewValue.Length()) {
// In the shrinking case, start filling the buffer from the beginning.
const uint32_t delta = (aTarget.Length() - aNewValue.Length());
for (i = 1; i < nonMatching.Length(); ++i) {
// When we move the i'th non-matching segment into position, we need to
// account for the characters deleted by the previous |i| replacements by
@@ -365,18 +362,17 @@ nsTString<T>::ReplaceSubstring(const sel
// Write the i'th replacement immediately before the new i'th non-matching
// segment.
char_traits::copy(destinationSegmentPtr - aNewValue.Length(),
aNewValue.Data(), aNewValue.Length());
}
}
// Adjust the length and make sure the string is null terminated.
- this->mLength = newLength.value();
- this->mData[this->mLength] = char_type(0);
+ this->FinishBulkWrite(newLength.value());
return true;
}
/**
* nsTString::Trim
*/
--- a/xpcom/string/nsTSubstring.cpp
+++ b/xpcom/string/nsTSubstring.cpp
@@ -42,55 +42,82 @@ nsTSubstring<T>::nsTSubstring(char_type*
*/
template <typename T>
inline const nsTAutoString<T>*
AsAutoString(const nsTSubstring<T>* aStr)
{
return static_cast<const nsTAutoString<T>*>(aStr);
}
-/**
- * this function is called to prepare mData for writing. the given capacity
- * indicates the required minimum storage size for mData, in sizeof(char_type)
- * increments. this function returns true if the operation succeeds. it also
- * returns the old data and old flags members if mData is newly allocated.
- * the old data must be released by the caller.
- */
-template <typename T>
-bool
-nsTSubstring<T>::MutatePrep(size_type aCapacity, char_type** aOldData,
- DataFlags* aOldDataFlags)
+template<typename T>
+uint32_t
+nsTSubstring<T>::StartBulkWrite(size_type aCapacity,
+ size_type aPrefixToPreserve,
+ bool aAllowShrinking,
+ size_type aSuffixLength,
+ size_type aOldSuffixStart,
+ size_type aNewSuffixStart)
{
- // initialize to no old data
- *aOldData = nullptr;
- *aOldDataFlags = DataFlags(0);
+ // Note! Capacity does not include room for the terminating null char.
+
+ MOZ_ASSERT(aPrefixToPreserve <= aCapacity,
+ "Requested preservation of an overlong prefix.");
+ MOZ_ASSERT(aNewSuffixStart + aSuffixLength <= aCapacity,
+ "Requesed move of suffix to out-of-bounds location.");
+ // Can't assert aOldSuffixStart, because mLength may not be valid anymore,
+ // since this method allows itself to be called more than once.
+ // If zero capacity is requested, set the string to the special empty
+ // string.
+ if (MOZ_UNLIKELY(!aCapacity)) {
+ ::ReleaseData(this->mData, this->mDataFlags);
+ SetToEmptyBuffer();
+ this->mDataFlags &= ~DataFlags::VOIDED; // mutation clears voided flag
+ return 0;
+ }
+
+ // Note! Capacity() returns 0 when the string is immutable.
size_type curCapacity = Capacity();
- // If |aCapacity > kMaxCapacity|, then our doubling algorithm may not be
- // able to allocate it. Just bail out in cases like that. We don't want
- // to be allocating 2GB+ strings anyway.
- static_assert((sizeof(nsStringBuffer) & 0x1) == 0,
- "bad size for nsStringBuffer");
- if (!CheckCapacity(aCapacity)) {
- return false;
- }
-
+ // We've established that aCapacity > 0.
// |curCapacity == 0| means that the buffer is immutable or 0-sized, so we
// need to allocate a new buffer. We cannot use the existing buffer even
// though it might be large enough.
- if (curCapacity != 0) {
- if (aCapacity <= curCapacity) {
- this->mDataFlags &= ~DataFlags::VOIDED; // mutation clears voided flag
- return true;
- }
+ if (!aAllowShrinking && aCapacity <= curCapacity) {
+ char_traits::move(this->mData + aNewSuffixStart,
+ this->mData + aOldSuffixStart,
+ aSuffixLength);
+ return curCapacity;
}
- if (curCapacity < aCapacity) {
+ char_type* oldData = this->mData;
+ DataFlags oldFlags = this->mDataFlags;
+
+ char_type* newData;
+ DataFlags newDataFlags;
+ size_type newCapacity;
+
+ // If this is an nsTAutoStringN, it's possible that we can use the inline
+ // buffer.
+ if ((this->mClassFlags & ClassFlags::INLINE) &&
+ (aCapacity <= AsAutoString(this)->mInlineCapacity)) {
+ newCapacity = AsAutoString(this)->mInlineCapacity;
+ newData = (char_type*)AsAutoString(this)->mStorage;
+ newDataFlags = DataFlags::TERMINATED | DataFlags::INLINE;
+ } else {
+ // If |aCapacity > kMaxCapacity|, then our doubling algorithm may not be
+ // able to allocate it. Just bail out in cases like that. We don't want
+ // to be allocating 2GB+ strings anyway.
+ static_assert((sizeof(nsStringBuffer) & 0x1) == 0,
+ "bad size for nsStringBuffer");
+ if (MOZ_UNLIKELY(!CheckCapacity(aCapacity))) {
+ return UINT32_MAX;
+ }
+
// We increase our capacity so that the allocated buffer grows
// exponentially, which gives us amortized O(1) appending. Below the
// threshold, we use powers-of-two. Above the threshold, we grow by at
// least 1.125, rounding up to the nearest MiB.
const size_type slowGrowthThreshold = 8 * 1024 * 1024;
// nsStringBuffer allocates sizeof(nsStringBuffer) + passed size, and
// storageSize below wants extra 1 * sizeof(char_type).
@@ -108,87 +135,70 @@ nsTSubstring<T>::MutatePrep(size_type aC
const size_t MiB = 1 << 20;
temp = (MiB * ((temp + MiB - 1) / MiB)) - neededExtraSpace;
} else {
// Round up to the next power of two.
temp =
mozilla::RoundUpPow2(aCapacity + neededExtraSpace) - neededExtraSpace;
}
- MOZ_ASSERT(XPCOM_MIN(temp, kMaxCapacity) >= aCapacity,
+ newCapacity = XPCOM_MIN(temp, kMaxCapacity);
+ MOZ_ASSERT(newCapacity >= aCapacity,
"should have hit the early return at the top");
- aCapacity = XPCOM_MIN(temp, kMaxCapacity);
- }
-
- //
- // several cases:
- //
- // (1) we have a refcounted shareable buffer (this->mDataFlags &
- // DataFlags::REFCOUNTED)
- // (2) we have an owned buffer (this->mDataFlags & DataFlags::OWNED)
- // (3) we have an inline buffer (this->mDataFlags & DataFlags::INLINE)
- // (4) we have a readonly buffer
- //
- // requiring that we in some cases preserve the data before creating
- // a new buffer complicates things just a bit ;-)
- //
-
- size_type storageSize = (aCapacity + 1) * sizeof(char_type);
-
- // case #1
- if (this->mDataFlags & DataFlags::REFCOUNTED) {
- nsStringBuffer* hdr = nsStringBuffer::FromData(this->mData);
- if (!hdr->IsReadonly()) {
- nsStringBuffer* newHdr = nsStringBuffer::Realloc(hdr, storageSize);
+ // Avoid shrinking if the new buffer is within 300 of the old. Note that
+ // unsigned underflow is defined behavior.
+ if ((curCapacity - newCapacity) <= 300 &&
+ (this->mDataFlags & DataFlags::REFCOUNTED)) {
+ MOZ_ASSERT(aAllowShrinking, "How come we didn't return earlier?");
+ // We're already close enough to the right size.
+ newData = oldData;
+ } else {
+ size_type storageSize = (newCapacity + 1) * sizeof(char_type);
+ // Since we allocate only if we need a different jemalloc bucket
+ // size, it's not useful to use realloc, which may spend time
+ // uselessly copying too much.
+ nsStringBuffer* newHdr = nsStringBuffer::Alloc(storageSize).take();
if (!newHdr) {
- return false; // out-of-memory (original header left intact)
+ return UINT32_MAX; // we are still in a consistent state
}
- hdr = newHdr;
- this->mData = (char_type*)hdr->Data();
- this->mDataFlags &= ~DataFlags::VOIDED; // mutation clears voided flag
- return true;
+ newData = (char_type*)newHdr->Data();
}
- }
-
- char_type* newData;
- DataFlags newDataFlags;
-
- // If this is an nsTAutoStringN whose inline buffer is sufficiently large,
- // then use it. This helps avoid heap allocations.
- if ((this->mClassFlags & ClassFlags::INLINE) &&
- (aCapacity < AsAutoString(this)->mInlineCapacity)) {
- newData = (char_type*)AsAutoString(this)->mStorage;
- newDataFlags = DataFlags::TERMINATED | DataFlags::INLINE;
- } else {
- // if we reach here then, we must allocate a new buffer. we cannot
- // make use of our DataFlags::OWNED or DataFlags::INLINE buffers because
- // they are not large enough.
-
- nsStringBuffer* newHdr =
- nsStringBuffer::Alloc(storageSize).take();
- if (!newHdr) {
- return false; // we are still in a consistent state
- }
-
- newData = (char_type*)newHdr->Data();
newDataFlags = DataFlags::TERMINATED | DataFlags::REFCOUNTED;
}
- // save old data and flags
- *aOldData = this->mData;
- *aOldDataFlags = this->mDataFlags;
+ this->mData = newData;
+ this->mDataFlags = newDataFlags;
+
+ if (oldData == newData) {
+ char_traits::move(
+ newData + aNewSuffixStart, oldData + aOldSuffixStart, aSuffixLength);
+ } else {
+ char_traits::copy(newData, oldData, aPrefixToPreserve);
+ char_traits::copy(
+ newData + aNewSuffixStart, oldData + aOldSuffixStart, aSuffixLength);
+ ::ReleaseData(oldData, oldFlags);
+ }
- // this->mLength does not change
- SetData(newData, this->mLength, newDataFlags);
+ return newCapacity;
+}
- // though we are not necessarily terminated at the moment, now is probably
- // still the best time to set DataFlags::TERMINATED.
-
- return true;
+template<typename T>
+void
+nsTSubstring<T>::FinishBulkWrite(size_type aLength)
+{
+ MOZ_ASSERT(aLength != UINT32_MAX, "OOM magic value passed as length.");
+ if (aLength) {
+ this->mData[aLength] = char_type(0);
+ this->mLength = aLength;
+ } else {
+ ::ReleaseData(this->mData, this->mDataFlags);
+ SetToEmptyBuffer();
+ }
+ AssertValid();
}
template <typename T>
void
nsTSubstring<T>::Finalize()
{
::ReleaseData(this->mData, this->mDataFlags);
// this->mData, this->mLength, and this->mDataFlags are purposefully left dangling
@@ -220,58 +230,26 @@ nsTSubstring<T>::ReplacePrep(index_type
newTotalLen.value());
}
template <typename T>
bool
nsTSubstring<T>::ReplacePrepInternal(index_type aCutStart, size_type aCutLen,
size_type aFragLen, size_type aNewLen)
{
- char_type* oldData;
- DataFlags oldFlags;
- if (!MutatePrep(aNewLen, &oldData, &oldFlags)) {
- return false; // out-of-memory
- }
-
- if (oldData) {
- // determine whether or not we need to copy part of the old string
- // over to the new string.
-
- if (aCutStart > 0) {
- // copy prefix from old string
- char_traits::copy(this->mData, oldData, aCutStart);
- }
+ size_type newSuffixStart = aCutStart + aFragLen;
+ size_type oldSuffixStart = aCutStart + aCutLen;
+ size_type suffixLength = this->mLength - oldSuffixStart;
- if (aCutStart + aCutLen < this->mLength) {
- // copy suffix from old string to new offset
- size_type from = aCutStart + aCutLen;
- size_type fromLen = this->mLength - from;
- uint32_t to = aCutStart + aFragLen;
- char_traits::copy(this->mData + to, oldData + from, fromLen);
- }
-
- ::ReleaseData(oldData, oldFlags);
- } else {
- // original data remains intact
-
- // determine whether or not we need to move part of the existing string
- // to make room for the requested hole.
- if (aFragLen != aCutLen && aCutStart + aCutLen < this->mLength) {
- uint32_t from = aCutStart + aCutLen;
- uint32_t fromLen = this->mLength - from;
- uint32_t to = aCutStart + aFragLen;
- char_traits::move(this->mData + to, this->mData + from, fromLen);
- }
+ size_type capacity = StartBulkWrite(
+ aNewLen, aCutStart, true, suffixLength, oldSuffixStart, newSuffixStart);
+ if (capacity == UINT32_MAX) {
+ return false;
}
-
- // add null terminator (mutable this->mData always has room for the null-
- // terminator).
- this->mData[aNewLen] = char_type(0);
- this->mLength = aNewLen;
-
+ FinishBulkWrite(aNewLen);
return true;
}
template <typename T>
typename nsTSubstring<T>::size_type
nsTSubstring<T>::Capacity() const
{
// return 0 to indicate an immutable or 0-sized buffer
@@ -556,30 +534,24 @@ nsTSubstring<T>::Assign(const substring_
{
if (aTuple.IsDependentOn(this->mData, this->mData + this->mLength)) {
// take advantage of sharing here...
return Assign(string_type(aTuple), aFallible);
}
size_type length = aTuple.Length();
- // don't use ReplacePrep here because it changes the length
- char_type* oldData;
- DataFlags oldFlags;
- if (!MutatePrep(length, &oldData, &oldFlags)) {
+ size_type capacity = StartBulkWrite(length);
+ if (capacity == UINT32_MAX) {
return false;
}
- if (oldData) {
- ::ReleaseData(oldData, oldFlags);
- }
+ aTuple.WriteTo(this->mData, length);
- aTuple.WriteTo(this->mData, length);
- this->mData[length] = 0;
- this->mLength = length;
+ FinishBulkWrite(length);
return true;
}
template <typename T>
void
nsTSubstring<T>::Adopt(char_type* aData, size_type aLength)
{
if (aData) {
@@ -765,50 +737,57 @@ nsTSubstring<T>::SetCapacity(size_type a
}
template <typename T>
bool
nsTSubstring<T>::SetCapacity(size_type aCapacity, const fallible_t&)
{
// capacity does not include room for the terminating null char
- // if our capacity is reduced to zero, then free our buffer.
- if (aCapacity == 0) {
- ::ReleaseData(this->mData, this->mDataFlags);
- SetToEmptyBuffer();
- return true;
+ // Sadly, existing callers assume that it's valid to
+ // first call SetCapacity(), then write past mLength
+ // and then call SetLength() with the assumption that
+ // SetLength still preserves the written data past
+ // mLength!!!
+
+ size_type preserve;
+ if (this->mDataFlags & DataFlags::REFCOUNTED) {
+ nsStringBuffer* hdr = nsStringBuffer::FromData(this->mData);
+ preserve = (hdr->StorageSize() / sizeof(char_type)) - 1;
+ } else if (this->mDataFlags & DataFlags::INLINE) {
+ preserve = AsAutoString(this)->mInlineCapacity;
+ } else {
+ preserve = this->mLength;
}
- char_type* oldData;
- DataFlags oldFlags;
- if (!MutatePrep(aCapacity, &oldData, &oldFlags)) {
- return false; // out-of-memory
+ if (preserve > aCapacity) {
+ preserve = aCapacity;
}
- // compute new string length
- size_type newLen = XPCOM_MIN(this->mLength, aCapacity);
-
- if (oldData) {
- // preserve old data
- if (this->mLength > 0) {
- char_traits::copy(this->mData, oldData, newLen);
- }
-
- ::ReleaseData(oldData, oldFlags);
+ size_type capacity = StartBulkWrite(aCapacity, preserve);
+ if (capacity == UINT32_MAX) {
+ return false;
}
+ if (capacity) {
+ // In the zero case StartBulkWrite already put the string
+ // in a valid state.
- // adjust this->mLength if our buffer shrunk down in size
- if (newLen < this->mLength) {
- this->mLength = newLen;
+ // Otherwise, instead of calling FinishBulkWrite,
+ // intentionally leave the string in the weird state
+ // required by the legacy semantics of this method.
+ if (aCapacity < this->mLength) {
+ // aCapacity not capacity for legacy reasons;
+ // maybe capacity would work, too.
+ this->mLength = aCapacity;
+ }
+ // Note that we can't write a terminator at
+ // mData[mLength], because doing so would overwrite
+ // data when this method is called from SetLength.
+ this->mData[aCapacity] = char_type(0);
}
-
- // always null-terminate here, even if the buffer got longer. this is
- // for backwards compat with the old string implementation.
- this->mData[aCapacity] = char_type(0);
-
return true;
}
template <typename T>
void
nsTSubstring<T>::SetLength(size_type aLength)
{
SetCapacity(aLength);
--- a/xpcom/string/nsTSubstring.h
+++ b/xpcom/string/nsTSubstring.h
@@ -895,38 +895,74 @@ protected:
/**
* this function releases mData and does not change the value of
* any of its member variables. in other words, this function acts
* like a destructor.
*/
void NS_FASTCALL Finalize();
+public:
/**
- * this function prepares mData to be mutated.
+ * Prepares mData to be mutated such that the capacity of the string
+ * (not counting the zero-terminator) is at least aCapacity.
+ * Returns the actual capacity, which may be larger than what was
+ * requested or UINT32_MAX on allocation failure.
+ *
+ * mLength is ignored by this method. If the buffer is reallocated,
+ * aPrefixToPreserve specifies how many code units to copy over to
+ * the new buffer. The old buffer is freed if applicable.
*
- * @param aCapacity specifies the required capacity of mData
- * @param aOldData returns null or the old value of mData
- * @param aOldFlags returns 0 or the old value of mDataFlags
+ * Unless the return value is UINT32_MAX to signal failure or 0 to
+ * signal that the string has been set to the special empty state,
+ * this method leaves the string in an invalid state! The caller is
+ * responsible for calling FinishBulkWrite() (or in Rust calling
+ * nsA[C]StringBulkWriteHandle::finish()), which puts the string
+ * into a valid state by setting mLength and zero-terminating.
+ * This method sets the flag to claim that the string is
+ * zero-terminated before it actually is.
+ *
+ * Once this method has been called and before FinishBulkWrite()
+ * has been called, only calls to Data() or this method again
+ * are valid. Do not call any other methods between calling this
+ * method and FinishBulkWrite().
*
- * if mData is already mutable and of sufficient capacity, then this
- * function will return immediately. otherwise, it will either resize
- * mData or allocate a new shared buffer. if it needs to allocate a
- * new buffer, then it will return the old buffer and the corresponding
- * flags. this allows the caller to decide when to free the old data.
+ * @param aCapacity The requested capacity. The return value
+ * will be greater than or equal to this value.
+ * @param aPrefixToPreserve The number of code units at the start
+ * of the old buffer to copy into the
+ * new buffer.
+ * @param aAllowShrinking If true, an allocation may be performed
+ * if the requested capacity is smaller
+ * than the current capacity.
+ * @param aSuffixLength The length, in code units, of a suffix
+ * to move.
+ * @param aOldSuffixStart The old start index of the suffix to
+ * move.
+ * @param aNewSuffixStart The new start index of the suffix to
+ * move.
*
- * this function returns false if is unable to allocate sufficient
- * memory.
- *
- * XXX we should expose a way for subclasses to free old_data.
*/
- bool NS_FASTCALL MutatePrep(size_type aCapacity,
- char_type** aOldData, DataFlags* aOldDataFlags);
+ uint32_t NS_FASTCALL StartBulkWrite(size_type aCapacity,
+ size_type aPrefixToPreserve = 0,
+ bool aAllowShrinking = true,
+ size_type aSuffixLength = 0,
+ size_type aOldSuffixStart = 0,
+ size_type aNewSuffixStart = 0);
/**
+ * Restores the string to a valid state after a call to StartBulkWrite()
+ * that returned a non-UINT32_MAX value. The argument to this method
+ * must be less than or equal to the non-UINT32_MAX value returned by
+ * the most recent StartBulkWrite() call.
+ */
+ void NS_FASTCALL FinishBulkWrite(size_type aLength);
+
+protected:
+ /**
* this function prepares a section of mData to be modified. if
* necessary, this function will reallocate mData and possibly move
* existing data to open up the specified section.
*
* @param aCutStart specifies the starting offset of the section
* @param aCutLength specifies the length of the section to be replaced
* @param aNewLength specifies the length of the new section
*
--- a/xpcom/string/nsUTF8Utils.h
+++ b/xpcom/string/nsUTF8Utils.h
@@ -6,20 +6,18 @@
#ifndef nsUTF8Utils_h_
#define nsUTF8Utils_h_
// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
// file will provide signatures for the Mozilla abstract string types. It will
// use XPCOM assertion/debugging macros, etc.
#include "nscore.h"
-#include "mozilla/arm.h"
#include "mozilla/Assertions.h"
#include "mozilla/EndianUtils.h"
-#include "mozilla/SSE.h"
#include "mozilla/TypeTraits.h"
#include "nsCharTraits.h"
#ifdef MOZILLA_INTERNAL_API
#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
#else
#define UTF8UTILS_WARNING(msg)
@@ -66,721 +64,196 @@ public:
return 2;
}
if (is3byte(aChar)) {
return 3;
}
if (is4byte(aChar)) {
return 4;
}
- if (is5byte(aChar)) {
- return 5;
- }
- if (is6byte(aChar)) {
- return 6;
- }
MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
return 1;
}
};
/**
- * Extract the next UCS-4 character from the buffer and return it. The
+ * Extract the next Unicode scalar value from the buffer and return it. The
* pointer passed in is advanced to the start of the next character in the
- * buffer. If non-null, the parameters err and overlong are filled in to
- * indicate that the character was represented by an overlong sequence, or
- * that an error occurred.
+ * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
+ * over the maximal valid prefix and *aErr is set to true (if aErr is not
+ * null).
+ *
+ * Note: This method never sets *aErr to false to allow error accumulation
+ * across multiple calls.
+ *
+ * Precondition: *aBuffer < aEnd
*/
-
class UTF8CharEnumerator
{
public:
- static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
+ static inline char32_t NextChar(const char** aBuffer,
+ const char* aEnd,
+ bool* aErr = nullptr)
{
- NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
-
- const char* p = *aBuffer;
- *aErr = false;
-
- if (p >= aEnd) {
- *aErr = true;
-
- return 0;
- }
-
- char c = *p++;
-
- if (UTF8traits::isASCII(c)) {
- *aBuffer = p;
- return c;
- }
-
- uint32_t ucs4;
- uint32_t minUcs4;
- int32_t state = 0;
+ MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+ MOZ_ASSERT(aEnd, "null end pointer");
- if (!CalcState(c, ucs4, minUcs4, state)) {
- NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
- *aErr = true;
-
- return 0;
- }
+ const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
+ const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
- while (state--) {
- if (p == aEnd) {
- *aErr = true;
-
- return 0;
- }
+ MOZ_ASSERT(p, "null buffer");
+ MOZ_ASSERT(p < end, "Bogus range");
- c = *p++;
-
- if (!AddByte(c, state, ucs4)) {
- *aErr = true;
+ unsigned char first = *p++;
- return 0;
- }
- }
-
- if (ucs4 < minUcs4) {
- // Overlong sequence
- ucs4 = UCS2_REPLACEMENT_CHAR;
- } else if (ucs4 >= 0xD800 &&
- (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
- // Surrogates and code points outside the Unicode range.
- ucs4 = UCS2_REPLACEMENT_CHAR;
+ if (MOZ_LIKELY(first < 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(p);
+ return first;
}
- *aBuffer = p;
- return ucs4;
- }
-
-private:
- static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
- int32_t& aState)
- {
- if (UTF8traits::is2byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
- aState = 1;
- aMinUcs4 = 0x00000080;
- } else if (UTF8traits::is3byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
- aState = 2;
- aMinUcs4 = 0x00000800;
- } else if (UTF8traits::is4byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
- aState = 3;
- aMinUcs4 = 0x00010000;
- } else if (UTF8traits::is5byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
- aState = 4;
- aMinUcs4 = 0x00200000;
- } else if (UTF8traits::is6byte(aChar)) {
- aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
- aState = 5;
- aMinUcs4 = 0x04000000;
- } else {
- return false;
+ // Unsigned underflow is defined behavior
+ if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
+ *aBuffer = reinterpret_cast<const char*>(p);
+ if (aErr) {
+ *aErr = true;
+ }
+ return 0xFFFDU;
}
- return true;
- }
-
- static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
- {
- if (UTF8traits::isInSeq(aChar)) {
- int32_t shift = aState * 6;
- aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
- return true;
- }
-
- return false;
- }
-};
-
+ unsigned char second = *p;
-/**
- * Extract the next UCS-4 character from the buffer and return it. The
- * pointer passed in is advanced to the start of the next character in the
- * buffer. If non-null, the err parameter is filled in if an error occurs.
- *
- * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
- * the buffer will be updated to move only a single UCS-2 character.
- *
- * Any other error returns 0 and does not move the buffer position.
- */
-
-
-class UTF16CharEnumerator
-{
-public:
- static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
- bool* aErr = nullptr)
- {
- NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
-
- const char16_t* p = *aBuffer;
-
- if (p >= aEnd) {
- NS_ERROR("No input to work with");
+ if (first < 0xE0U) {
+ // Two-byte
+ if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(++p);
+ return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
+ }
+ *aBuffer = reinterpret_cast<const char*>(p);
if (aErr) {
*aErr = true;
}
-
- return 0;
+ return 0xFFFDU;
}
- char16_t c = *p++;
-
- if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
- if (aErr) {
- *aErr = false;
- }
- *aBuffer = p;
- return c;
- } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
- if (p == aEnd) {
- // Found a high surrogate at the end of the buffer. Flag this
- // as an error and return the Unicode replacement
- // character 0xFFFD.
-
- UTF8UTILS_WARNING("Unexpected end of buffer after high surrogate");
-
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p;
- return 0xFFFD;
+ if (MOZ_LIKELY(first < 0xF0U)) {
+ // Three-byte
+ unsigned char lower = 0x80U;
+ unsigned char upper = 0xBFU;
+ if (first == 0xE0U) {
+ lower = 0xA0U;
+ } else if (first == 0xEDU) {
+ upper = 0x9FU;
}
-
- // D800- DBFF - High Surrogate
- char16_t h = c;
-
- c = *p++;
-
- if (NS_IS_LOW_SURROGATE(c)) {
- // DC00- DFFF - Low Surrogate
- // N = (H - D800) *400 + 10000 + (L - DC00)
- uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
- if (aErr) {
- *aErr = false;
+ if (MOZ_LIKELY(second >= lower && second <= upper)) {
+ if (MOZ_LIKELY(p != end)) {
+ unsigned char third = *++p;
+ if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(++p);
+ return ((uint32_t(first) & 0xFU) << 12) |
+ ((uint32_t(second) & 0x3FU) << 6) |
+ (uint32_t(third) & 0x3FU);
+ }
}
- *aBuffer = p;
- return ucs4;
- } else {
- // Found a high surrogate followed by something other than
- // a low surrogate. Flag this as an error and return the
- // Unicode replacement character 0xFFFD. Note that the
- // pointer to the next character points to the second 16-bit
- // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
- // only the first code unit of an illegal sequence must be
- // treated as an illegally terminated code unit sequence
- // (also Chapter 3 D91, "isolated [not paired and ill-formed]
- // UTF-16 code units in the range D800..DFFF are ill-formed").
- UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
-
- if (aErr) {
- *aErr = true;
- }
- *aBuffer = p - 1;
- return 0xFFFD;
}
- } else { // U+DC00 - U+DFFF
- // DC00- DFFF - Low Surrogate
-
- // Found a low surrogate w/o a preceding high surrogate. Flag
- // this as an error and return the Unicode replacement
- // character 0xFFFD.
-
- UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
+ *aBuffer = reinterpret_cast<const char*>(p);
if (aErr) {
*aErr = true;
}
- *aBuffer = p;
- return 0xFFFD;
+ return 0xFFFDU;
}
- MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
- }
-};
-
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for converting
- * UTF-8 to UTF-16
- */
-class ConvertUTF8toUTF16
-{
-public:
- typedef char value_type;
- typedef char16_t buffer_type;
-
- explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
- : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
- {
- }
-
- size_t Length() const
- {
- return mBuffer - mStart;
- }
-
- bool ErrorEncountered() const
- {
- return mErrorEncountered;
- }
-
- void write(const value_type* aStart, uint32_t aN)
- {
- if (mErrorEncountered) {
- return;
+ // Four-byte
+ unsigned char lower = 0x80U;
+ unsigned char upper = 0xBFU;
+ if (first == 0xF0U) {
+ lower = 0x90U;
+ } else if (first == 0xF4U) {
+ upper = 0x8FU;
}
-
- // algorithm assumes utf8 units won't
- // be spread across fragments
- const value_type* p = aStart;
- const value_type* end = aStart + aN;
- buffer_type* out = mBuffer;
- for (; p != end /* && *p */;) {
- bool err;
- uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
-
- if (err) {
- mErrorEncountered = true;
- mBuffer = out;
- return;
- }
-
- if (ucs4 >= PLANE1_BASE) {
- *out++ = (buffer_type)H_SURROGATE(ucs4);
- *out++ = (buffer_type)L_SURROGATE(ucs4);
- } else {
- *out++ = ucs4;
+ if (MOZ_LIKELY(second >= lower && second <= upper)) {
+ if (MOZ_LIKELY(p != end)) {
+ unsigned char third = *++p;
+ if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
+ if (MOZ_LIKELY(p != end)) {
+ unsigned char fourth = *++p;
+ if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
+ *aBuffer = reinterpret_cast<const char*>(++p);
+ return ((uint32_t(first) & 0x7U) << 18) |
+ ((uint32_t(second) & 0x3FU) << 12) |
+ ((uint32_t(third) & 0x3FU) << 6) |
+ (uint32_t(fourth) & 0x3FU);
+ }
+ }
+ }
}
}
- mBuffer = out;
+ *aBuffer = reinterpret_cast<const char*>(p);
+ if (aErr) {
+ *aErr = true;
+ }
+ return 0xFFFDU;
}
-
- void write_terminator()
- {
- *mBuffer = buffer_type(0);
- }
-
-private:
- buffer_type* const mStart;
- buffer_type* mBuffer;
- bool mErrorEncountered;
};
/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the length of the UTF-16 string equivalent to a UTF-8 string.
+ * Extract the next Unicode scalar value from the buffer and return it. The
+ * pointer passed in is advanced to the start of the next character in the
+ * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
+ * the unpaired surrogate and *aErr is set to true (if aErr is not null).
+ *
+ * Note: This method never sets *aErr to false to allow error accumulation
+ * across multiple calls.
+ *
+ * Precondition: *aBuffer < aEnd
*/
-class CalculateUTF8Length
+class UTF16CharEnumerator
{
public:
- typedef char value_type;
-
- CalculateUTF8Length()
- : mLength(0), mErrorEncountered(false)
- {
- }
-
- size_t Length() const
- {
- return mLength;
- }
-
- void write(const value_type* aStart, uint32_t aN)
+ static inline char32_t NextChar(const char16_t** aBuffer,
+ const char16_t* aEnd,
+ bool* aErr = nullptr)
{
- // ignore any further requests
- if (mErrorEncountered) {
- return;
- }
-
- // algorithm assumes utf8 units won't
- // be spread across fragments
- const value_type* p = aStart;
- const value_type* end = aStart + aN;
- for (; p < end /* && *p */; ++mLength) {
- if (UTF8traits::isASCII(*p)) {
- p += 1;
- } else if (UTF8traits::is2byte(*p)) {
- p += 2;
- } else if (UTF8traits::is3byte(*p)) {
- p += 3;
- } else if (UTF8traits::is4byte(*p)) {
- // Because a UTF-8 sequence of 4 bytes represents a codepoint
- // greater than 0xFFFF, it will become a surrogate pair in the
- // UTF-16 string, so add 1 more to mLength.
- // This doesn't happen with is5byte and is6byte because they
- // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
- // converted to a single replacement character.
-
- // However, there is one case when a 4 byte UTF-8 sequence will
- // only generate 2 UTF-16 bytes. If we have a properly encoded
- // sequence, but with an invalid value (too small or too big),
- // that will result in a replacement character being written
- // This replacement character is encoded as just 1 single
- // UTF-16 character, which is 2 bytes.
+ MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
+ MOZ_ASSERT(aEnd, "null end pointer");
- // The below code therefore only adds 1 to mLength if the UTF8
- // data will produce a decoded character which is greater than
- // or equal to 0x010000 and less than 0x0110000.
-
- // A 4byte UTF8 character is encoded as
- // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
- // map to bit 17-21 in the final result. If these bits are
- // between 0x01 and 0x11, that means that the final result is
- // between 0x010000 and 0x110000. The below code reads these
- // bits out and assigns them to c, but shifted up 4 bits to
- // avoid having to shift twice.
-
- // It doesn't matter what to do in the case where p + 4 > end
- // since no UTF16 characters will be written in that case by
- // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
- // any of the surrogate bits are wrong since no UTF16
- // characters will be written in that case either.
+ const char16_t* p = *aBuffer;
- if (p + 4 <= end) {
- uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
- ((uint32_t)(p[1] & 0x30));
- if (c >= 0x010 && c < 0x110) {
- ++mLength;
- }
- }
+ MOZ_ASSERT(p, "null buffer");
+ MOZ_ASSERT(p < aEnd, "Bogus range");
- p += 4;
- } else if (UTF8traits::is5byte(*p)) {
- p += 5;
- } else if (UTF8traits::is6byte(*p)) {
- p += 6;
- } else { // error
- ++mLength; // to account for the decrement below
- break;
- }
- }
- if (p != end) {
- NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
- --mLength; // The last multi-byte char wasn't complete, discard it.
- mErrorEncountered = true;
- }
- }
-
-private:
- size_t mLength;
- bool mErrorEncountered;
-};
+ char16_t c = *p++;
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for
- * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
- * (0xEFBFBD in UTF-8).
- */
-class ConvertUTF16toUTF8
-{
-public:
- typedef char16_t value_type;
- typedef char buffer_type;
-
- // The error handling here is more lenient than that in
- // |ConvertUTF8toUTF16|, but it's that way for backwards
- // compatibility.
-
- explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
- : mStart(aBuffer), mBuffer(aBuffer)
- {
- }
-
- size_t Size() const
- {
- return mBuffer - mStart;
- }
-
- void write(const value_type* aStart, uint32_t aN)
- {
- buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
-
- for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
- value_type c = *p;
- if (!(c & 0xFF80)) { // U+0000 - U+007F
- *out++ = (char)c;
- } else if (!(c & 0xF800)) { // U+0100 - U+07FF
- *out++ = 0xC0 | (char)(c >> 6);
- *out++ = 0x80 | (char)(0x003F & c);
- } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
- *out++ = 0xE0 | (char)(c >> 12);
- *out++ = 0x80 | (char)(0x003F & (c >> 6));
- *out++ = 0x80 | (char)(0x003F & c);
- } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
- // D800- DBFF - High Surrogate
- value_type h = c;
-
- ++p;
- if (p == end) {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
-
- UTF8UTILS_WARNING("String ending in half a surrogate pair!");
-
- break;
+ // Let's use encoding_rs-style code golf here.
+ // Unsigned underflow is defined behavior
+ char16_t cMinusSurrogateStart = c - 0xD800U;
+ if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
+ *aBuffer = p;
+ return c;
+ }
+ if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
+ // High surrogate
+ if (MOZ_LIKELY(p != aEnd)) {
+ char16_t second = *p;
+ // Unsigned underflow is defined behavior
+ if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
+ *aBuffer = ++p;
+ return (uint32_t(c) << 10) + uint32_t(second) -
+ (((0xD800U << 10) - 0x10000U) + 0xDC00U);
}
- c = *p;
-
- if (NS_IS_LOW_SURROGATE(c)) {
- // DC00- DFFF - Low Surrogate
- // N = (H - D800) *400 + 10000 + ( L - DC00 )
- uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
-
- // 0001 0000-001F FFFF
- *out++ = 0xF0 | (char)(ucs4 >> 18);
- *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
- *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
- *out++ = 0x80 | (char)(0x003F & ucs4);
- } else {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
-
- // The pointer to the next character points to the second
- // 16-bit value, not beyond it, as per Unicode 5.0.0
- // Chapter 3 C10, only the first code unit of an illegal
- // sequence must be treated as an illegally terminated
- // code unit sequence (also Chapter 3 D91, "isolated [not
- // paired and ill-formed] UTF-16 code units in the range
- // D800..DFFF are ill-formed").
- p--;
-
- UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
- }
- } else { // U+DC00 - U+DFFF
- // Treat broken characters as the Unicode replacement
- // character 0xFFFD (0xEFBFBD in UTF-8)
- *out++ = '\xEF';
- *out++ = '\xBF';
- *out++ = '\xBD';
-
- // DC00- DFFF - Low Surrogate
- UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
}
}
-
- mBuffer = out;
- }
-
- void write_terminator()
- {
- *mBuffer = buffer_type(0);
- }
-
-private:
- buffer_type* const mStart;
- buffer_type* mBuffer;
-};
-
-/**
- * A character sink (see |copy_string| in nsAlgorithm.h) for computing
- * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
- * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
- */
-class CalculateUTF8Size
-{
-public:
- typedef char16_t value_type;
-
- CalculateUTF8Size()
- : mSize(0)
- {
- }
-
- size_t Size() const
- {
- return mSize;
+ // Unpaired surrogate
+ *aBuffer = p;
+ if (aErr) {
+ *aErr = true;
+ }
+ return 0xFFFDU;
}
-
- void write(const value_type* aStart, uint32_t aN)
- {
- // Assume UCS2 surrogate pairs won't be spread across fragments.
- for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
- value_type c = *p;
- if (!(c & 0xFF80)) { // U+0000 - U+007F
- mSize += 1;
- } else if (!(c & 0xF800)) { // U+0100 - U+07FF
- mSize += 2;
- } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
- mSize += 3;
- } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
- ++p;
- if (p == end) {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- mSize += 3;
-
- UTF8UTILS_WARNING("String ending in half a surrogate pair!");
-
- break;
- }
- c = *p;
-
- if (0xDC00 == (0xFC00 & c)) {
- mSize += 4;
- } else {
- // Treat broken characters as the Unicode
- // replacement character 0xFFFD (0xEFBFBD in
- // UTF-8)
- mSize += 3;
-
- // The next code unit is the second 16-bit value, not
- // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
- // only the first code unit of an illegal sequence must
- // be treated as an illegally terminated code unit
- // sequence (also Chapter 3 D91, "isolated [not paired and
- // ill-formed] UTF-16 code units in the range D800..DFFF
- // are ill-formed").
- p--;
-
- UTF8UTILS_WARNING("got a high Surrogate but no low surrogate");
- }
- } else { // U+DC00 - U+DFFF
- // Treat broken characters as the Unicode replacement
- // character 0xFFFD (0xEFBFBD in UTF-8)
- mSize += 3;
-
- UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
- }
- }
- }
-
-private:
- size_t mSize;
};
-#ifdef MOZILLA_INTERNAL_API
-/**
- * A character sink that performs a |reinterpret_cast|-style conversion
- * from char to char16_t.
- */
-class LossyConvertEncoding8to16
-{
-public:
- typedef char value_type;
- typedef char input_type;
- typedef char16_t output_type;
-
-public:
- explicit LossyConvertEncoding8to16(char16_t* aDestination) :
- mDestination(aDestination)
- {
- }
-
- void
- write(const char* aSource, uint32_t aSourceLength)
- {
-#ifdef MOZILLA_MAY_SUPPORT_SSE2
- if (mozilla::supports_sse2()) {
- write_sse2(aSource, aSourceLength);
- return;
- }
-#endif
-#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
- if (mozilla::supports_neon()) {
- write_neon(aSource, aSourceLength);
- return;
- }
-#endif
- const char* done_writing = aSource + aSourceLength;
- while (aSource < done_writing) {
- *mDestination++ = (char16_t)(unsigned char)(*aSource++);
- }
- }
-
- void
- write_sse2(const char* aSource, uint32_t aSourceLength);
-#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
- void
- write_neon(const char* aSource, uint32_t aSourceLength);
-#endif
-
- void
- write_terminator()
- {
- *mDestination = (char16_t)(0);
- }
-
-private:
- char16_t* mDestination;
-};
-
-/**
- * A character sink that performs a |reinterpret_cast|-style conversion
- * from char16_t to char.
- */
-class LossyConvertEncoding16to8
-{
-public:
- typedef char16_t value_type;
- typedef char16_t input_type;
- typedef char output_type;
-
- explicit LossyConvertEncoding16to8(char* aDestination)
- : mDestination(aDestination)
- {
- }
-
- void
- write(const char16_t* aSource, uint32_t aSourceLength)
- {
-#ifdef MOZILLA_MAY_SUPPORT_SSE2
- if (mozilla::supports_sse2()) {
- write_sse2(aSource, aSourceLength);
- return;
- }
-#endif
-#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
- if (mozilla::supports_neon()) {
- write_neon(aSource, aSourceLength);
- return;
- }
-#endif
- const char16_t* done_writing = aSource + aSourceLength;
- while (aSource < done_writing) {
- *mDestination++ = (char)(*aSource++);
- }
- }
-
-#ifdef MOZILLA_MAY_SUPPORT_SSE2
- void
- write_sse2(const char16_t* aSource, uint32_t aSourceLength);
-#endif
-#if defined(MOZILLA_MAY_SUPPORT_NEON) && defined(MOZ_LITTLE_ENDIAN)
- void
- write_neon(const char16_t* aSource, uint32_t aSourceLength);
-#endif
-
- void
- write_terminator()
- {
- *mDestination = '\0';
- }
-
-private:
- char* mDestination;
-};
-#endif // MOZILLA_INTERNAL_API
-
-
template<typename Char, typename UnsignedT>
inline UnsignedT
RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
{
static_assert(mozilla::IsSame<Char, char>::value ||
mozilla::IsSame<Char, unsigned char>::value ||
mozilla::IsSame<Char, signed char>::value,
"UTF-8 data must be in 8-bit units");
--- a/xpcom/tests/gtest/TestAtoms.cpp
+++ b/xpcom/tests/gtest/TestAtoms.cpp
@@ -77,39 +77,40 @@ TEST(Atoms, Invalid)
{
RefPtr<nsAtom> atom16 = NS_Atomize(Invalid16Strings[i].m16);
EXPECT_TRUE(atom16->Equals(nsDependentString(Invalid16Strings[i].m16)));
}
EXPECT_EQ(count, NS_GetNumberOfAtoms());
}
-
+#ifndef DEBUG
+// Don't run these tests in debug builds, as they intentionally trigger assertions.
for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) {
nsrefcnt count = NS_GetNumberOfAtoms();
{
RefPtr<nsAtom> atom8 = NS_Atomize(Invalid8Strings[i].m8);
RefPtr<nsAtom> atom16 = NS_Atomize(Invalid8Strings[i].m16);
EXPECT_EQ(atom16, atom8);
EXPECT_TRUE(atom16->Equals(nsDependentString(Invalid8Strings[i].m16)));
}
EXPECT_EQ(count, NS_GetNumberOfAtoms());
}
-// Don't run this test in debug builds as that intentionally asserts.
-#ifndef DEBUG
- RefPtr<nsAtom> emptyAtom = NS_Atomize("");
-
for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
nsrefcnt count = NS_GetNumberOfAtoms();
- RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i]);
- EXPECT_EQ(atom8, emptyAtom);
+ {
+ RefPtr<nsAtom> atom8 = NS_Atomize(Malformed8Strings[i].m8);
+ RefPtr<nsAtom> atom16 = NS_Atomize(Malformed8Strings[i].m16);
+ EXPECT_EQ(atom8, atom16);
+ }
+
EXPECT_EQ(count, NS_GetNumberOfAtoms());
}
#endif
}
#define FIRST_ATOM_STR "first static atom. Hello!"
#define SECOND_ATOM_STR "second static atom. @World!"
#define THIRD_ATOM_STR "third static atom?!"
--- a/xpcom/tests/gtest/TestStrings.cpp
+++ b/xpcom/tests/gtest/TestStrings.cpp
@@ -764,22 +764,20 @@ TEST_F(Strings, replace_substr)
s.AssignLiteral("foofoofoo");
s.ReplaceSubstring("of", "fo");
EXPECT_STREQ(s.get(), "fofoofooo");
}
TEST_F(Strings, replace_substr_2)
{
- const char *oldName = nullptr;
const char *newName = "user";
nsString acctName; acctName.AssignLiteral("forums.foo.com");
nsAutoString newAcctName, oldVal, newVal;
- CopyASCIItoUTF16(oldName, oldVal);
- CopyASCIItoUTF16(newName, newVal);
+ CopyASCIItoUTF16(mozilla::MakeStringSpan(newName), newVal);
newAcctName.Assign(acctName);
// here, oldVal is empty. we are testing that this function
// does not hang. see bug 235355.
newAcctName.ReplaceSubstring(oldVal, newVal);
// we expect that newAcctName will be unchanged.
EXPECT_TRUE(newAcctName.Equals(acctName));
@@ -1282,16 +1280,45 @@ TEST(String, strip_chars)
test_strip_chars_helper(u"foo",
u"foo",
NS_LITERAL_STRING(""));
test_strip_chars_helper(u" foo",
u" ",
NS_LITERAL_STRING("foo"));
}
+TEST_F(Strings, append_with_capacity) // Appends made after SetCapacity must leave the read pointer unchanged.
+{
+ nsAutoString s;
+ const char16_t* origPtr = s.BeginReading(); // read pointer before SetCapacity
+ s.SetCapacity(100);
+ const char16_t* ptr = s.BeginReading(); // read pointer after SetCapacity
+ EXPECT_NE(origPtr, ptr); // SetCapacity(100) is expected to change the read pointer
+ for (int i = 0; i < 100; i++) {
+ s.Append(u'a'); // one code unit per iteration, 100 in total, matching the requested capacity
+ EXPECT_EQ(s.BeginReading(), ptr); // the pointer must be identical after every append
+ }
+}
+
+TEST_F(Strings, append_string_with_capacity) // Same pointer-stability check, appending a two-unit string each time.
+{
+ nsAutoString aa; // becomes a two-code-unit string through the two appends below
+ aa.Append(u'a');
+ aa.Append(u'a');
+ nsAutoString s;
+ const char16_t* origPtr = s.BeginReading(); // read pointer before SetCapacity
+ s.SetCapacity(200);
+ const char16_t* ptr = s.BeginReading(); // read pointer after SetCapacity
+ EXPECT_NE(origPtr, ptr); // SetCapacity(200) is expected to change the read pointer
+ for (int i = 0; i < 100; i++) {
+ s.Append(aa); // 100 appends of two units each, 200 in total, matching the requested capacity
+ EXPECT_EQ(s.BeginReading(), ptr); // the pointer must be identical after every append
+ }
+}
+
TEST_F(Strings, huge_capacity)
{
nsString a, b, c, d, e, f, g, h, i, j, k, l, m, n;
nsCString n1;
// Ignore the result if the address space is less than 64-bit because
// some of the allocations above will exhaust the address space.
if (sizeof(void*) >= 8) {
--- a/xpcom/tests/gtest/TestTextFormatter.cpp
+++ b/xpcom/tests/gtest/TestTextFormatter.cpp
@@ -12,17 +12,16 @@ TEST(TextFormatter, Tests)
nsAutoString fmt(NS_LITERAL_STRING("%3$s %4$S %1$d %2$d %2$d %3$s"));
char utf8[] = "Hello";
char16_t ucs2[]={'W', 'o', 'r', 'l', 'd', 0x4e00, 0xAc00, 0xFF45, 0x0103, 0x00};
int d=3;
char16_t buf[256];
nsTextFormatter::snprintf(buf, 256, fmt.get(), d, 333, utf8, ucs2);
nsAutoString out(buf);
- ASSERT_STREQ("Hello World", NS_LossyConvertUTF16toASCII(out).get());
const char16_t *uout = out.get();
const char16_t expected[] = {0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20,
0x57, 0x6F, 0x72, 0x6C, 0x64, 0x4E00,
0xAC00, 0xFF45, 0x0103, 0x20, 0x33,
0x20, 0x33, 0x33, 0x33, 0x20, 0x33,
0x33, 0x33, 0x20, 0x48, 0x65, 0x6C,
0x6C, 0x6F};
--- a/xpcom/tests/gtest/TestUTF.cpp
+++ b/xpcom/tests/gtest/TestUTF.cpp
@@ -9,16 +9,17 @@
#include <stdio.h>
#include <stdlib.h>
#include "nsString.h"
#include "nsStringBuffer.h"
#include "nsReadableUtils.h"
#include "UTFStrings.h"
#include "nsUnicharUtils.h"
#include "mozilla/HashFunctions.h"
+#include "nsUTF8Utils.h"
#include "gtest/gtest.h"
using namespace mozilla;
namespace TestUTF {
TEST(UTF, Valid)
@@ -72,59 +73,53 @@ TEST(UTF, Invalid8)
EXPECT_TRUE(tmp16.Equals(NS_LITERAL_STRING("string ") + str16));
EXPECT_EQ(CompareUTF8toUTF16(str8, str16), 0);
}
}
TEST(UTF, Malformed8)
{
-// Don't run this test in debug builds as that intentionally asserts.
-#ifndef DEBUG
for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
- nsDependentCString str8(Malformed8Strings[i]);
+ nsDependentString str16(Malformed8Strings[i].m16); // expected UTF-16 text for this entry
+ nsDependentCString str8(Malformed8Strings[i].m8); // malformed 8-bit input for this entry
- EXPECT_TRUE(NS_ConvertUTF8toUTF16(str8).IsEmpty());
+ EXPECT_TRUE(NS_ConvertUTF8toUTF16(str8).Equals(str16)); // conversion must equal the expected m16 text instead of being empty
- nsString tmp16(NS_LITERAL_STRING("string"));
+ nsString tmp16(NS_LITERAL_STRING("string ")); // prefix that the converted text is appended to
AppendUTF8toUTF16(str8, tmp16);
- EXPECT_TRUE(tmp16.EqualsLiteral("string"));
+ EXPECT_TRUE(tmp16.Equals(NS_LITERAL_STRING("string ") + str16)); // result is the prefix followed by the expected m16 text
- EXPECT_NE(CompareUTF8toUTF16(str8, EmptyString()), 0);
+ EXPECT_EQ(CompareUTF8toUTF16(str8, str16), 0); // comparing the input with the expected m16 text must return 0
}
-#endif
}
TEST(UTF, Hash16)
{
for (unsigned int i = 0; i < ArrayLength(ValidStrings); ++i) {
nsDependentCString str8(ValidStrings[i].m8);
bool err;
EXPECT_EQ(HashString(ValidStrings[i].m16),
HashUTF8AsUTF16(str8.get(), str8.Length(), &err));
EXPECT_FALSE(err);
}
for (unsigned int i = 0; i < ArrayLength(Invalid8Strings); ++i) {
nsDependentCString str8(Invalid8Strings[i].m8);
bool err;
- EXPECT_EQ(HashString(Invalid8Strings[i].m16),
- HashUTF8AsUTF16(str8.get(), str8.Length(), &err));
- EXPECT_FALSE(err);
+ EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u); // Invalid8Strings input is now expected to hash to 0
+ EXPECT_TRUE(err); // and to set the error flag
}
-// Don't run this test in debug builds as that intentionally asserts.
-#ifndef DEBUG
for (unsigned int i = 0; i < ArrayLength(Malformed8Strings); ++i) {
- nsDependentCString str8(Malformed8Strings[i]);
+ nsDependentCString str8(Malformed8Strings[i].m8); // the m8 member of the entry is the 8-bit input
bool err;
EXPECT_EQ(HashUTF8AsUTF16(str8.get(), str8.Length(), &err), 0u);
EXPECT_TRUE(err);
}
-#endif
}
/**
* This tests the handling of a non-ascii character at various locations in a
* UTF-16 string that is being converted to UTF-8.
*/
void NonASCII16_helper(const size_t aStrSize)
{
@@ -173,19 +168,81 @@ void NonASCII16_helper(const size_t aStr
// And finish with the trailing ASCII chars.
expected.Append(asciiCString.BeginReading() + i + 1, kTestSize - i - 1);
EXPECT_STREQ(dest.BeginReading(), expected.BeginReading());
}
}
-TEST(UTF, NonASCII16)
+TEST(UTF, UTF8CharEnumerator) // Feeds mixed valid and invalid UTF-8 through UTF8CharEnumerator::NextChar and checks each result.
{
- // Test with various string sizes to catch any special casing.
- NonASCII16_helper(1);
- NonASCII16_helper(8);
- NonASCII16_helper(16);
- NonASCII16_helper(32);
- NonASCII16_helper(512);
+ const char* p = "\x61\xC0\xC2\xC2\x80\xE0\x80\x80\xE0\xA0\x80\xE1\x80\x80\xED\xBF\xBF\xED\x9F\xBF\xEE\x80\x80\xEE\x80\xFF\xF0\x90\x80\x80\xF0\x80\x80\x80\xF1\x80\x80\x80\xF4\x8F\xBF\xF4\x8F\xBF\xBF\xF4\xBF\xBF\xBF";
+ const char* end = p + 49; // 49 is the byte length of the literal above, excluding its terminator
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0061U);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0080U);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x0800U);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x1000U);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xD7FFU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xE000U);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10000U);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x40000U);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0x10FFFFU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(p, end); // the 28 calls above must have advanced p across all 49 bytes
+ p = "\xC2";
+ end = p + 1; // input ends right after the single byte 0xC2
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // one U+FFFD is expected for the cut-off input
+ EXPECT_EQ(p, end);
+ p = "\xE1\x80";
+ end = p + 2; // input ends after the two bytes 0xE1 0x80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // one U+FFFD covers both bytes, since p reaches end below
+ EXPECT_EQ(p, end);
+ p = "\xF1\x80\x80";
+ end = p + 3; // input ends after the three bytes 0xF1 0x80 0x80
+ EXPECT_EQ(UTF8CharEnumerator::NextChar(&p, end), 0xFFFDU); // one U+FFFD covers all three bytes, since p reaches end below
+ EXPECT_EQ(p, end);
+}
+
+TEST(UTF, UTF16CharEnumerator) // Checks UTF16CharEnumerator::NextChar on a surrogate pair and on unpaired surrogates.
+{
+ const char16_t* p = u"\u0061\U0001F4A9";
+ const char16_t* end = p + 3; // three code units: 0x0061 followed by the two-unit encoding of U+1F4A9
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x0061U);
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x1F4A9U); // the surrogate pair is returned as one code point
+ EXPECT_EQ(p, end);
+ const char16_t loneHigh = 0xD83D; // high surrogate with nothing after it
+ p = &loneHigh;
+ end = p + 1;
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(p, end);
+ const char16_t loneLow = 0xDCA9; // low surrogate with no high surrogate before it
+ p = &loneLow;
+ end = p + 1;
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU);
+ EXPECT_EQ(p, end);
+ const char16_t loneHighStr[] = { 0xD83D, 0x0061 }; // high surrogate followed by a non-surrogate unit
+ p = loneHighStr;
+ end = p + 2;
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0xFFFDU); // the unpaired surrogate is expected to yield U+FFFD
+ EXPECT_EQ(UTF16CharEnumerator::NextChar(&p, end), 0x0061U); // and the following 0x0061 is still returned by the next call
+ EXPECT_EQ(p, end);
}
} // namespace TestUTF
--- a/xpcom/tests/gtest/UTFStrings.h
+++ b/xpcom/tests/gtest/UTFStrings.h
@@ -56,57 +56,73 @@ static const UTFStringsStringPair Invali
{ { 0xDC00, 0xD800, 0xDC00, 0xD800 },
{ char(0xEF), char(0xBF), char(0xBD), char(0xF0), char(0x90), char(0x80), char(0x80), char(0xEF), char(0xBF), char(0xBD) } },
{ { 0xDC00, 0xD800, 0xD800, 0xDC00 },
{ char(0xEF), char(0xBF), char(0xBD), char(0xEF), char(0xBF), char(0xBD), char(0xF0), char(0x90), char(0x80), char(0x80) } },
};
static const UTFStringsStringPair Invalid8Strings[] =
{
- { { 'a', 0xFFFD, 'b' },
+ { { 'a', 0xFFFD, 0xFFFD, 'b' }, // two U+FFFD now expected for the two bytes 0xC0 0x80 below
{ 'a', char(0xC0), char(0x80), 'b' } },
- { { 0xFFFD, 0x80 },
+ { { 0xFFFD, 0xFFFD, 0x80 }, // two U+FFFD for 0xC1 0xBF, then 0x80 for 0xC2 0x80
{ char(0xC1), char(0xBF), char(0xC2), char(0x80) } },
- { { 0xFFFD },
+ { { 0xFFFD, 0xFFFD },
{ char(0xC1), char(0xBF) } },
- { { 0xFFFD, 'x', 0x0800 },
+ { { 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0x0800 }, // three U+FFFD for 0xE0 0x80 0x80; 0xE0 0xA0 0x80 still gives 0x0800
{ char(0xE0), char(0x80), char(0x80), 'x', char(0xE0), char(0xA0), char(0x80) } },
- { { 0xFFFD, 'x', 0xFFFD },
+ { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xF0), char(0x80), char(0x80), char(0x80), 'x', char(0xF0), char(0x80), char(0x8F), char(0x80) } },
- { { 0xFFFD, 0xFFFD },
+ { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xF4), char(0x90), char(0x80), char(0x80), char(0xF7), char(0xBF), char(0xBF), char(0xBF) } },
- { { 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD },
+ { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xD800, 0xDC00, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, // the middle 0xF0 0x90 0x80 0x80 still maps to the pair 0xD800 0xDC00
{ char(0xF0), char(0x8F), char(0xBF), char(0xBF), 'x', char(0xF0), char(0x90), char(0x80), char(0x80), char(0xF0), char(0x8F), char(0xBF), char(0xBF) } },
- { { 0xFFFD, 'x', 0xFFFD },
+ { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'x', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xF8), char(0x80), char(0x80), char(0x80), char(0x80), 'x', char(0xF8), char(0x88), char(0x80), char(0x80), char(0x80) } },
- { { 0xFFFD, 0xFFFD },
+ { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
{ char(0xFB), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xFC), char(0xA0), char(0x80), char(0x80), char(0x80), char(0x80) } },
- { { 0xFFFD, 0xFFFD },
+ { { 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD }, // twelve U+FFFD for the twelve input bytes below
{ char(0xFC), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0xFD), char(0xBF), char(0xBF), char(0xBF), char(0xBF), char(0xBF) } },
};
-// Don't use this array in debug builds as that intentionally asserts.
-#ifndef DEBUG
-static const char Malformed8Strings[][16] =
+static const UTFStringsStringPair Malformed8Strings[] = // each entry now pairs the expected UTF-16 units with the malformed 8-bit input
{
- { char(0x80) },
- { 'a', char(0xC8), 'c' },
- { 'a', char(0xC0) },
- { 'a', char(0xE8), 'c' },
- { 'a', char(0xE8), char(0x80), 'c' },
- { 'a', char(0xE8), char(0x80) },
- { char(0xE8), 0x7F, char(0x80) },
- { 'a', char(0xE8), char(0xE8), char(0x80) },
- { 'a', char(0xF4) },
- { 'a', char(0xF4), char(0x80), char(0x80), 'c', 'c' },
- { 'a', char(0xF4), char(0x80), 'x', char(0x80) },
- { char(0xF4), char(0x80), char(0x80), char(0x80), char(0x80) },
- { 'a', char(0xFA), 'c' },
- { 'a', char(0xFA), char(0x80), char(0x80), 0x7F, char(0x80), 'c' },
- { 'a', char(0xFA), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), 'c' },
- { 'a', char(0xFD) },
- { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), 'c' },
- { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80) },
- { 'a', char(0xFC), char(0x80), char(0x80), 0x40, char(0x80), char(0x80), 'c' },
+ { { 0xFFFD }, // a lone 0x80 byte is expected to become one U+FFFD
+ { char(0x80) } },
+ { { 'a', 0xFFFD, 'c' },
+ { 'a', char(0xC8), 'c' } },
+ { { 'a', 0xFFFD },
+ { 'a', char(0xC8) } },
+ { { 'a', 0xFFFD, 'c' },
+ { 'a', char(0xE8), 'c' } },
+ { { 'a', 0xFFFD, 'c' }, // the two bytes 0xE8 0x80 together give a single U+FFFD
+ { 'a', char(0xE8), char(0x80), 'c' } },
+ { { 'a', 0xFFFD },
+ { 'a', char(0xE8), char(0x80) } },
+ { { 0xFFFD, 0x7F, 0xFFFD },
+ { char(0xE8), 0x7F, char(0x80) } },
+ { { 'a', 0xFFFD, 0xFFFD },
+ { 'a', char(0xE8), char(0xE8), char(0x80) } },
+ { { 'a', 0xFFFD },
+ { 'a', char(0xF4) } },
+ { { 'a', 0xFFFD, 'c', 'c' }, // the three bytes 0xF4 0x80 0x80 together give a single U+FFFD
+ { 'a', char(0xF4), char(0x80), char(0x80), 'c', 'c' } },
+ { { 'a', 0xFFFD, 'x', 0xFFFD },
+ { 'a', char(0xF4), char(0x80), 'x', char(0x80) } },
+ { { 0xDBC0, 0xDC00, 0xFFFD }, // 0xF4 0x80 0x80 0x80 maps to the pair 0xDBC0 0xDC00; the extra 0x80 gives U+FFFD
+ { char(0xF4), char(0x80), char(0x80), char(0x80), char(0x80) } },
+ { { 'a', 0xFFFD, 'c' },
+ { 'a', char(0xFA), 'c' } },
+ { { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0x7F, 0xFFFD, 'c' }, // here 0xFA and each 0x80 byte get their own U+FFFD
+ { 'a', char(0xFA), char(0x80), char(0x80), 0x7F, char(0x80), 'c' } },
+ { { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'c' },
+ { 'a', char(0xFA), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), 'c' } },
+ { { 'a', 0xFFFD },
+ { 'a', char(0xFD) } },
+ { { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 'c' },
+ { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), 'c' } },
+ { { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD },
+ { 'a', char(0xFD), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80), char(0x80) } },
+ { { 'a', 0xFFFD, 0xFFFD, 0xFFFD, 0x40, 0xFFFD, 0xFFFD, 'c' },
+ { 'a', char(0xFD), char(0x80), char(0x80), 0x40, char(0x80), char(0x80), 'c' } },
};
-#endif
#endif