Bug 1402247 part 4 - Use encoding_rs::mem for string conversions that yield plain C strings.
MozReview-Commit-ID: 5x2UhkxiE6H
--- a/intl/encoding_glue/src/lib.rs
+++ b/intl/encoding_glue/src/lib.rs
@@ -595,8 +595,60 @@ pub unsafe extern "C" fn encoding_mem_is
pub unsafe extern "C" fn encoding_mem_is_ascii(buffer: *const u8, len: usize) -> bool {
encoding_rs::mem::is_ascii(::std::slice::from_raw_parts(buffer, len))
}
#[no_mangle]
pub unsafe extern "C" fn encoding_mem_is_basic_latin(buffer: *const u16, len: usize) -> bool {
encoding_rs::mem::is_basic_latin(::std::slice::from_raw_parts(buffer, len))
}
+
+/// Converts UTF-16 to Latin1 via `encoding_rs::mem::convert_utf16_to_latin1_lossy`.
+///
+/// # Safety
+///
+/// `src` must be non-null, aligned and valid for reads of `src_len` `u16`s, and `dst` must be
+/// non-null and valid for writes of `dst_len` bytes without overlapping `src`; both are turned
+/// into slices, so this holds even for zero lengths. `dst_len` must not be less than `src_len`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf16_to_latin1_lossy(src: *const u16, src_len: usize, dst: *mut u8, dst_len: usize) {
+    encoding_rs::mem::convert_utf16_to_latin1_lossy(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len));
+}
+
+/// Converts Latin1 to UTF-16 via `encoding_rs::mem::convert_latin1_to_utf16`.
+///
+/// # Safety
+///
+/// `src` must be non-null and valid for reads of `src_len` bytes, and `dst` must be non-null,
+/// aligned and valid for writes of `dst_len` `u16`s without overlapping `src`; both are turned
+/// into slices, so this holds even for zero lengths. `dst_len` must not be less than `src_len`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_latin1_to_utf16(src: *const u8, src_len: usize, dst: *mut u16, dst_len: usize) {
+    encoding_rs::mem::convert_latin1_to_utf16(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len));
+}
+
+/// Converts UTF-16 to UTF-8 via `encoding_rs::mem::convert_utf16_to_utf8` and returns the
+/// number of bytes written to `dst`.
+///
+/// # Safety
+///
+/// `src` must be non-null, aligned and valid for reads of `src_len` `u16`s, and `dst` must be
+/// non-null and valid for writes of `dst_len` bytes without overlapping `src`; both are turned
+/// into slices, so this holds even for zero lengths. `dst_len` must be at least
+/// `src_len * 3 + 1`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf16_to_utf8(src: *const u16, src_len: usize, dst: *mut u8, dst_len: usize) -> usize {
+    encoding_rs::mem::convert_utf16_to_utf8(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len))
+}
+
+/// Converts UTF-8 to UTF-16 via `encoding_rs::mem::convert_utf8_to_utf16` and returns the
+/// number of `u16`s written to `dst`.
+///
+/// # Safety
+///
+/// `src` must be non-null and valid for reads of `src_len` bytes, and `dst` must be non-null,
+/// aligned and valid for writes of `dst_len` `u16`s without overlapping `src`; both are turned
+/// into slices, so this holds even for zero lengths. encoding_rs documents that `dst_len` must
+/// be at least `src_len + 1`.
+#[no_mangle]
+pub unsafe extern "C" fn encoding_mem_convert_utf8_to_utf16(src: *const u8, src_len: usize, dst: *mut u16, dst_len: usize) -> usize {
+    encoding_rs::mem::convert_utf8_to_utf16(::std::slice::from_raw_parts(src, src_len), ::std::slice::from_raw_parts_mut(dst, dst_len))
+}
--- a/xpcom/string/moz.build
+++ b/xpcom/string/moz.build
@@ -49,18 +49,16 @@ UNIFIED_SOURCES += [
'precompiled_templates.cpp',
]
# Are we targeting x86 or x86-64? If so, compile the SSE2 functions for
# nsUTF8Utils.cpp and nsReadableUtils.cpp.
if CONFIG['INTEL_ARCHITECTURE']:
SOURCES += ['nsUTF8UtilsSSE2.cpp']
SOURCES['nsUTF8UtilsSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
- SOURCES += ['nsReadableUtilsSSE2.cpp']
- SOURCES['nsReadableUtilsSSE2.cpp'].flags += CONFIG['SSE2_FLAGS']
if CONFIG['BUILD_ARM_NEON'] or CONFIG['CPU_ARCH'] == 'aarch64':
SOURCES += ['nsUTF8UtilsNEON.cpp']
SOURCES['nsUTF8UtilsNEON.cpp'].flags += CONFIG['NEON_FLAGS']
# MSVC 2017 has a bug that incorrectly generates C5037 warning which
# hits the template string code. We need to disable this warning as a
# workaround. See https://developercommunity.visualstudio.com/
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -1,87 +1,28 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsReadableUtils.h"
-#include "nsReadableUtilsImpl.h"
#include <algorithm>
#include "mozilla/CheckedInt.h"
#include "nscore.h"
#include "nsMemory.h"
#include "nsString.h"
#include "nsTArray.h"
#include "nsUTF8Utils.h"
-using mozilla::IsASCII;
-
-/**
- * Fallback implementation for finding the first non-ASCII character in a
- * UTF-16 string.
- */
-static inline int32_t
-FirstNonASCIIUnvectorized(const char16_t* aBegin, const char16_t* aEnd)
-{
- typedef mozilla::NonASCIIParameters<sizeof(size_t)> p;
- const size_t kMask = p::mask();
- const uintptr_t kAlignMask = p::alignMask();
- const size_t kNumUnicharsPerWord = p::numUnicharsPerWord();
-
- const char16_t* idx = aBegin;
-
- // Align ourselves to a word boundary.
- for (; idx != aEnd && ((uintptr_t(idx) & kAlignMask) != 0); idx++) {
- if (!IsASCII(*idx)) {
- return idx - aBegin;
- }
- }
-
- // Check one word at a time.
- const char16_t* wordWalkEnd = mozilla::aligned(aEnd, kAlignMask);
- for (; idx != wordWalkEnd; idx += kNumUnicharsPerWord) {
- const size_t word = *reinterpret_cast<const size_t*>(idx);
- if (word & kMask) {
- return idx - aBegin;
- }
- }
-
- // Take care of the remainder one character at a time.
- for (; idx != aEnd; idx++) {
- if (!IsASCII(*idx)) {
- return idx - aBegin;
- }
- }
-
- return -1;
-}
-
-/*
- * This function returns -1 if all characters in str are ASCII characters.
- * Otherwise, it returns a value less than or equal to the index of the first
- * ASCII character in str. For example, if first non-ASCII character is at
- * position 25, it may return 25, 24, or 16. But it guarantees
- * there are only ASCII characters before returned value.
- */
-static inline int32_t
-FirstNonASCII(const char16_t* aBegin, const char16_t* aEnd)
-{
-#ifdef MOZILLA_MAY_SUPPORT_SSE2
- if (mozilla::supports_sse2()) {
- return mozilla::SSE2::FirstNonASCII(aBegin, aEnd);
- }
-#endif
-
- return FirstNonASCIIUnvectorized(aBegin, aEnd);
-}
+using mozilla::MakeSpan;
+using mozilla::AsWritableBytes;
/**
* A helper function that allocates a buffer of the desired character type big enough to hold a copy of the supplied string (plus a zero terminator).
*
* @param aSource an string you will eventually be making a copy of
* @return a new buffer (of the type specified by the second parameter) which you must free with |free|.
*
*/
@@ -93,101 +34,105 @@ AllocateStringCopy(const FromStringT& aS
return static_cast<ToCharT*>(moz_xmalloc(
(aSource.Length() + 1) * sizeof(ToCharT)));
}
char*
ToNewCString(const nsAString& aSource)
{
- char* result = AllocateStringCopy(aSource, (char*)0);
- if (!result) {
+ char* dest = AllocateStringCopy(aSource, (char*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsAString::const_iterator fromBegin, fromEnd;
- LossyConvertEncoding16to8 converter(result);
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter).write_terminator();
- return result;
+ auto len = aSource.Length();
+ LossyConvertUTF16toLatin1(aSource, AsWritableBytes(MakeSpan(dest, len)));
+ dest[len] = 0;
+ return dest;
}
char*
ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count)
{
- nsAString::const_iterator start, end;
- CalculateUTF8Size calculator;
- copy_string(aSource.BeginReading(start), aSource.EndReading(end),
- calculator);
-
- if (aUTF8Count) {
- *aUTF8Count = calculator.Size();
+ auto len = aSource.Length();
+ // The uses of this function seem temporary enough that it's not
+ // worthwhile to be fancy about the allocation size. Let's just use
+ // the worst case.
+ // Times 3 plus 2, because ConvertUTF16toUTF8Func requires times 3 plus 1 and
+ // then we have the terminator.
+ mozilla::CheckedInt<size_t> destLen(len);
+ destLen *= 3;
+ destLen += 2;
+ if (!destLen.isValid()) {
+ return nullptr;
}
-
- char* result = static_cast<char*>
- (moz_xmalloc(calculator.Size() + 1));
- if (!result) {
+ size_t destLenVal = destLen.value();
+ if (destLenVal > UINT32_MAX) {
+ return nullptr;
+ }
+ char* dest = static_cast<char*>(moz_xmalloc(destLenVal));
+ if (!dest) {
return nullptr;
}
- ConvertUTF16toUTF8 converter(result);
- copy_string(aSource.BeginReading(start), aSource.EndReading(end),
- converter).write_terminator();
- NS_ASSERTION(calculator.Size() == converter.Size(), "length mismatch");
+ size_t written = ConvertUTF16toUTF8Func(aSource, AsWritableBytes(MakeSpan(dest, destLenVal)));
+ dest[written] = 0;
- return result;
+ if (aUTF8Count) {
+ *aUTF8Count = written;
+ }
+
+ return dest;
}
char*
ToNewCString(const nsACString& aSource)
{
// no conversion needed, just allocate a buffer of the correct length and copy into it
- char* result = AllocateStringCopy(aSource, (char*)0);
- if (!result) {
+ char* dest = AllocateStringCopy(aSource, (char*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsACString::const_iterator fromBegin, fromEnd;
- char* toBegin = result;
- *copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- toBegin) = char(0);
- return result;
+ auto len = aSource.Length();
+ memcpy(dest, aSource.BeginReading(), len * sizeof(char));
+ dest[len] = 0;
+ return dest;
}
char16_t*
ToNewUnicode(const nsAString& aSource)
{
// no conversion needed, just allocate a buffer of the correct length and copy into it
- char16_t* result = AllocateStringCopy(aSource, (char16_t*)0);
- if (!result) {
+ char16_t* dest = AllocateStringCopy(aSource, (char16_t*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsAString::const_iterator fromBegin, fromEnd;
- char16_t* toBegin = result;
- *copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- toBegin) = char16_t(0);
- return result;
+ auto len = aSource.Length();
+ memcpy(dest, aSource.BeginReading(), len * sizeof(char16_t));
+ dest[len] = 0;
+ return dest;
}
char16_t*
ToNewUnicode(const nsACString& aSource)
{
- char16_t* result = AllocateStringCopy(aSource, (char16_t*)0);
- if (!result) {
+ char16_t* dest = AllocateStringCopy(aSource, (char16_t*)nullptr);
+ if (!dest) {
return nullptr;
}
- nsACString::const_iterator fromBegin, fromEnd;
- LossyConvertEncoding8to16 converter(result);
- copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd),
- converter).write_terminator();
- return result;
+ auto len = aSource.Length();
+ ConvertLatin1toUTF16(aSource, MakeSpan(dest, len));
+ dest[len] = 0;
+ return dest;
}
uint32_t
CalcUTF8ToUnicodeLength(const nsACString& aSource)
{
nsACString::const_iterator start, end;
CalculateUTF8Length calculator;
copy_string(aSource.BeginReading(start), aSource.EndReading(end),
@@ -208,31 +153,41 @@ UTF8ToUnicodeBuffer(const nsACString& aS
*aUTF16Count = converter.Length();
}
return aBuffer;
}
char16_t*
UTF8ToNewUnicode(const nsACString& aSource, uint32_t* aUTF16Count)
{
- const uint32_t length = CalcUTF8ToUnicodeLength(aSource);
- const size_t buffer_size = (length + 1) * sizeof(char16_t);
- char16_t* buffer = static_cast<char16_t*>(moz_xmalloc(buffer_size));
- if (!buffer) {
+  // encoding_rs::mem::convert_utf8_to_utf16 needs a destination of at least
+  // the source length plus one; AllocateStringCopy leaves no such slack.
+  size_t lengthPlusOne = size_t(aSource.Length()) + 1;
+  // One more code unit holds the zero terminator.
+  mozilla::CheckedInt<size_t> allocLength(lengthPlusOne);
+  allocLength += 1;
+  allocLength *= sizeof(char16_t);
+  if (!allocLength.isValid()) {
+    return nullptr;
+  }
+  char16_t* dest =
+    static_cast<char16_t*>(moz_xmalloc(allocLength.value()));
+  if (!dest) {
return nullptr;
}
- uint32_t copied;
- UTF8ToUnicodeBuffer(aSource, buffer, &copied);
- NS_ASSERTION(length == copied, "length mismatch");
+  size_t written =
+    ConvertUTF8toUTF16Func(aSource, MakeSpan(dest, lengthPlusOne));
+  dest[written] = 0;
if (aUTF16Count) {
- *aUTF16Count = copied;
+    *aUTF16Count = written;
}
- return buffer;
+
+  return dest;
}
char16_t*
CopyUnicodeTo(const nsAString& aSource, uint32_t aSrcOffset, char16_t* aDest,
uint32_t aLength)
{
nsAString::const_iterator fromBegin, fromEnd;
char16_t* toBegin = aDest;
--- a/xpcom/string/nsReadableUtils.h
+++ b/xpcom/string/nsReadableUtils.h
@@ -18,28 +18,82 @@
#include "nsTArrayForwardDeclare.h"
// Can't include mozilla/Encoding.h here
extern "C" {
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
bool encoding_mem_is_ascii(uint8_t const* buffer, size_t buffer_len);
bool encoding_mem_is_basic_latin(char16_t const* buffer, size_t buffer_len);
+ void encoding_mem_convert_utf16_to_latin1_lossy(const char16_t* src, size_t src_len, uint8_t* dst, size_t dst_len);
+ void encoding_mem_convert_latin1_to_utf16(const uint8_t* src, size_t src_len, char16_t* dst, size_t dst_len);
+ size_t encoding_mem_convert_utf16_to_utf8(const char16_t* src, size_t src_len, uint8_t* dst, size_t dst_len);
+ size_t encoding_mem_convert_utf8_to_utf16(const uint8_t* src, size_t src_len, char16_t* dst, size_t dst_len);
}
// From the nsstring crate
extern "C" {
bool nsstring_fallible_append_utf8_impl(nsAString* aThis, const uint8_t* aOther, size_t aOtherLen, size_t aOldLen);
bool nsstring_fallible_append_latin1_impl(nsAString* aThis, const uint8_t* aOther, size_t aOtherLen, size_t aOldLen);
bool nscstring_fallible_append_utf16_to_utf8_impl(nsACString* aThis, const char16_t*, size_t aOtherLen, size_t aOldLen);
bool nscstring_fallible_append_utf16_to_latin1_lossy_impl(nsACString* aThis, const char16_t*, size_t aOtherLen, size_t aOldLen);
bool nscstring_fallible_append_utf8_to_latin1_lossy_check(nsACString* aThis, const nsACString* aOther, size_t aOldLen);
bool nscstring_fallible_append_latin1_to_utf8_check(nsACString* aThis, const nsACString* aOther, size_t aOldLen);
}
+/**
+ * If all the code points in the input are below U+0100, converts to Latin1, i.e. unsigned byte value is Unicode
+ * scalar value; not windows-1252. If there are code points above U+00FF, asserts in debug builds and produces
+ * garbage in release builds. The nature of the garbage depends on the CPU architecture and must not be relied upon.
+ *
+ * The length of aDest must not be less than the length of aSource.
+ */
+inline void
+LossyConvertUTF16toLatin1(mozilla::Span<const char16_t> aSource, mozilla::Span<uint8_t> aDest)
+{
+ encoding_mem_convert_utf16_to_latin1_lossy(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
+ * Interprets unsigned byte value as Unicode scalar value (i.e. not windows-1252!).
+ *
+ * The length of aDest must not be less than the length of aSource.
+ */
+inline void
+ConvertLatin1toUTF16(mozilla::Span<const uint8_t> aSource, mozilla::Span<char16_t> aDest)
+{
+ encoding_mem_convert_latin1_to_utf16(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
+ * Lone surrogates are replaced with the REPLACEMENT CHARACTER.
+ *
+ * The length of aDest must be at least the length of aSource times three _plus one_.
+ *
+ * Returns the number of code units written.
+ */
+inline size_t
+ConvertUTF16toUTF8Func(mozilla::Span<const char16_t> aSource, mozilla::Span<uint8_t> aDest)
+{
+ return encoding_mem_convert_utf16_to_utf8(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
+/**
+ * Malformed byte sequences are replaced with the REPLACEMENT CHARACTER.
+ *
+ * The length of aDest must be at least the length of aSource _plus one_.
+ *
+ * Returns the number of code units written.
+ */
+inline size_t
+ConvertUTF8toUTF16Func(mozilla::Span<const uint8_t> aSource, mozilla::Span<char16_t> aDest)
+{
+ return encoding_mem_convert_utf8_to_utf16(aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
+}
+
inline size_t
Distance(const nsReadingIterator<char16_t>& aStart,
const nsReadingIterator<char16_t>& aEnd)
{
MOZ_ASSERT(aStart.get() <= aEnd.get());
return static_cast<size_t>(aEnd.get() - aStart.get());
}
inline size_t
@@ -96,16 +150,17 @@ inline MOZ_MUST_USE bool CopyUTF8toUTF16
inline void CopyUTF8toUTF16(const char* aSource,
nsAString& aDest)
{
if (aSource) {
CopyUTF8toUTF16(mozilla::AsBytes(mozilla::MakeStringSpan(aSource)), aDest);
}
}
+
inline MOZ_MUST_USE bool AppendUTF8toUTF16(const char* aSource,
nsAString& aDest,
const mozilla::fallible_t&)
{
if (aSource) {
return AppendUTF8toUTF16(mozilla::AsBytes(mozilla::MakeStringSpan(aSource)), aDest, mozilla::fallible);
}
return true;
@@ -436,19 +491,20 @@ char16_t* UTF8ToUnicodeBuffer(const nsAC
uint32_t* aUTF16Count = nullptr);
/**
* Returns a new |char16_t| buffer containing a zero-terminated copy
* of |aSource|.
*
* Allocates and returns a new |char| buffer which you must free with
* |free|. Performs an encoding conversion from UTF-8 to UTF-16
- * while copying |aSource| to your new buffer. This conversion is well defined
- * for a valid UTF-8 string. The new buffer is zero-terminated, but that
- * may not help you if |aSource| contains embedded nulls.
+ * while copying |aSource| to your new buffer. Malformed byte sequences
+ * are replaced with the REPLACEMENT CHARACTER. The new buffer is
+ * zero-terminated, but that may not help you if |aSource| contains
+ * embedded nulls.
*
* @param aSource an 8-bit wide string, UTF-8 encoded
* @param aUTF16Count the number of 16-bit units that was returned
* @return a new |char16_t| buffer you must free with |free|.
* (UTF-16 encoded)
*/
char16_t* UTF8ToNewUnicode(const nsACString& aSource,
uint32_t* aUTF16Count = nullptr);
deleted file mode 100644
--- a/xpcom/string/nsReadableUtilsImpl.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* vim: set ts=8 sts=2 et sw=2 tw=80: */
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#include <stdint.h>
-
-namespace mozilla {
-
-inline bool IsASCII(char16_t aChar) {
- return (aChar & 0xFF80) == 0;
-}
-
-/**
- * Provides a pointer before or equal to |aPtr| that is is suitably aligned.
- */
-inline const char16_t* aligned(const char16_t* aPtr, const uintptr_t aMask)
-{
- return reinterpret_cast<const char16_t*>(
- reinterpret_cast<uintptr_t>(aPtr) & ~aMask);
-}
-
-/**
- * Structures for word-sized vectorization of ASCII checking for UTF-16
- * strings.
- */
-template<size_t size> struct NonASCIIParameters;
-template<> struct NonASCIIParameters<4> {
- static inline size_t mask() { return 0xff80ff80; }
- static inline uintptr_t alignMask() { return 0x3; }
- static inline size_t numUnicharsPerWord() { return 2; }
-};
-
-template<> struct NonASCIIParameters<8> {
- static inline size_t mask() {
- static const uint64_t maskAsUint64 = UINT64_C(0xff80ff80ff80ff80);
- // We have to explicitly cast this 64-bit value to a size_t, or else
- // compilers for 32-bit platforms will warn about it being too large to fit
- // in the size_t return type. (Fortunately, this code isn't actually
- // invoked on 32-bit platforms -- they'll use the <4> specialization above.
- // So it is, in fact, OK that this value is too large for a 32-bit size_t.)
- return (size_t)maskAsUint64;
- }
- static inline uintptr_t alignMask() { return 0x7; }
- static inline size_t numUnicharsPerWord() { return 4; }
-};
-
-namespace SSE2 {
-
-int32_t FirstNonASCII(const char16_t* aBegin, const char16_t* aEnd);
-
-} // namespace SSE2
-} // namespace mozilla
deleted file mode 100644
--- a/xpcom/string/nsReadableUtilsSSE2.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* vim: set ts=8 sts=2 et sw=2 tw=80: */
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#include <emmintrin.h>
-
-#include "nsReadableUtilsImpl.h"
-
-namespace mozilla {
-namespace SSE2 {
-
-static inline bool
-is_zero (__m128i x)
-{
- return
- _mm_movemask_epi8(_mm_cmpeq_epi8(x, _mm_setzero_si128())) == 0xffff;
-}
-
-int32_t
-FirstNonASCII(const char16_t* aBegin, const char16_t* aEnd)
-{
- const size_t kNumUnicharsPerVector = sizeof(__m128i) / sizeof(char16_t);
- typedef NonASCIIParameters<sizeof(size_t)> p;
- const size_t kMask = p::mask();
- const uintptr_t kXmmAlignMask = 0xf;
- const uint16_t kShortMask = 0xff80;
- const size_t kNumUnicharsPerWord = p::numUnicharsPerWord();
-
- const char16_t* idx = aBegin;
-
- // Align ourselves to a 16-byte boundary as required by _mm_load_si128
- for (; idx != aEnd && ((uintptr_t(idx) & kXmmAlignMask) != 0); idx++) {
- if (!IsASCII(*idx)) {
- return idx - aBegin;
- }
- }
-
- // Check one XMM register (16 bytes) at a time.
- const char16_t* vectWalkEnd = aligned(aEnd, kXmmAlignMask);
- __m128i vectmask = _mm_set1_epi16(static_cast<int16_t>(kShortMask));
- for (; idx != vectWalkEnd; idx += kNumUnicharsPerVector) {
- const __m128i vect = *reinterpret_cast<const __m128i*>(idx);
- if (!is_zero(_mm_and_si128(vect, vectmask))) {
- return idx - aBegin;
- }
- }
-
- // Check one word at a time.
- const char16_t* wordWalkEnd = aligned(aEnd, p::alignMask());
- for(; idx != wordWalkEnd; idx += kNumUnicharsPerWord) {
- const size_t word = *reinterpret_cast<const size_t*>(idx);
- if (word & kMask) {
- return idx - aBegin;
- }
- }
-
- // Take care of the remainder one character at a time.
- for (; idx != aEnd; idx++) {
- if (!IsASCII(*idx)) {
- return idx - aBegin;
- }
- }
-
- return -1;
-}
-
-} // namespace SSE2
-} // namespace mozilla