mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-08-02 22:30:31 +00:00
AK: Replace converting to and from UTF-16 with simdutf
The one behavior difference is that Utf16View::to_utf8(AllowInvalidCodeUnits::No) now returns an error on invalid code units (e.g. a lone surrogate) instead of replacing them with U+FFFD. It was arguably a bug that this wasn't already the case.
This commit is contained in:
parent
32ffe9bbfc
commit
0c14a9417a
Notes:
sideshowbarker
2024-07-18 23:45:58 +09:00
Author: https://github.com/trflynn89
Commit: 0c14a9417a
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/674
Reviewed-by: https://github.com/ADKaster
4 changed files with 81 additions and 23 deletions
|
@ -4,6 +4,8 @@
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#define AK_DONT_REPLACE_STD
|
||||||
|
|
||||||
#include <AK/Array.h>
|
#include <AK/Array.h>
|
||||||
#include <AK/Checked.h>
|
#include <AK/Checked.h>
|
||||||
#include <AK/FlyString.h>
|
#include <AK/FlyString.h>
|
||||||
|
@ -11,9 +13,12 @@
|
||||||
#include <AK/MemMem.h>
|
#include <AK/MemMem.h>
|
||||||
#include <AK/Stream.h>
|
#include <AK/Stream.h>
|
||||||
#include <AK/String.h>
|
#include <AK/String.h>
|
||||||
|
#include <AK/Utf16View.h>
|
||||||
#include <AK/Vector.h>
|
#include <AK/Vector.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include <simdutf.h>
|
||||||
|
|
||||||
namespace AK {
|
namespace AK {
|
||||||
|
|
||||||
String String::from_utf8_without_validation(ReadonlyBytes bytes)
|
String String::from_utf8_without_validation(ReadonlyBytes bytes)
|
||||||
|
@ -39,6 +44,30 @@ ErrorOr<String> String::from_utf8(StringView view)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
||||||
|
{
|
||||||
|
if (!utf16.validate())
|
||||||
|
return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
|
||||||
|
|
||||||
|
String result;
|
||||||
|
|
||||||
|
auto utf8_length = simdutf::utf8_length_from_utf16(
|
||||||
|
reinterpret_cast<char16_t const*>(utf16.data()),
|
||||||
|
utf16.length_in_code_units());
|
||||||
|
|
||||||
|
TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
|
||||||
|
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(
|
||||||
|
reinterpret_cast<char16_t const*>(utf16.data()),
|
||||||
|
utf16.length_in_code_units(),
|
||||||
|
reinterpret_cast<char*>(buffer.data()));
|
||||||
|
ASSERT(result == buffer.size());
|
||||||
|
|
||||||
|
return {};
|
||||||
|
}));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
|
ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
|
||||||
{
|
{
|
||||||
String result;
|
String result;
|
||||||
|
|
|
@ -50,12 +50,16 @@ public:
|
||||||
|
|
||||||
// Creates a new String from a sequence of UTF-8 encoded code points.
|
// Creates a new String from a sequence of UTF-8 encoded code points.
|
||||||
static ErrorOr<String> from_utf8(StringView);
|
static ErrorOr<String> from_utf8(StringView);
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>)
|
requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>)
|
||||||
static ErrorOr<String> from_utf8(T&&) = delete;
|
static ErrorOr<String> from_utf8(T&&) = delete;
|
||||||
|
|
||||||
[[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes);
|
[[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes);
|
||||||
|
|
||||||
|
// Creates a new String from a sequence of UTF-16 encoded code points.
|
||||||
|
static ErrorOr<String> from_utf16(Utf16View const&);
|
||||||
|
|
||||||
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
|
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
|
||||||
static ErrorOr<String> from_stream(Stream&, size_t byte_count);
|
static ErrorOr<String> from_stream(Stream&, size_t byte_count);
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ static constexpr u32 replacement_code_point = 0xfffd;
|
||||||
static constexpr u32 first_supplementary_plane_code_point = 0x10000;
|
static constexpr u32 first_supplementary_plane_code_point = 0x10000;
|
||||||
|
|
||||||
template<OneOf<Utf8View, Utf32View> UtfViewType>
|
template<OneOf<Utf8View, Utf32View> UtfViewType>
|
||||||
static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)
|
static ErrorOr<Utf16Data> to_utf16_slow(UtfViewType const& view)
|
||||||
{
|
{
|
||||||
Utf16Data utf16_data;
|
Utf16Data utf16_data;
|
||||||
TRY(utf16_data.try_ensure_capacity(view.length()));
|
TRY(utf16_data.try_ensure_capacity(view.length()));
|
||||||
|
@ -39,17 +39,45 @@ static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)
|
||||||
|
|
||||||
ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view)
|
ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view)
|
||||||
{
|
{
|
||||||
return to_utf16_impl(Utf8View { utf8_view });
|
return utf8_to_utf16(Utf8View { utf8_view });
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view)
|
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view)
|
||||||
{
|
{
|
||||||
return to_utf16_impl(utf8_view);
|
// All callers want to allow lonely surrogates, which simdutf does not permit.
|
||||||
|
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
|
||||||
|
return to_utf16_slow(utf8_view);
|
||||||
|
|
||||||
|
Utf16Data utf16_data;
|
||||||
|
|
||||||
|
TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(
|
||||||
|
reinterpret_cast<char const*>(utf8_view.bytes()),
|
||||||
|
utf8_view.byte_length())));
|
||||||
|
|
||||||
|
[[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16(
|
||||||
|
reinterpret_cast<char const*>(utf8_view.bytes()),
|
||||||
|
utf8_view.byte_length(),
|
||||||
|
reinterpret_cast<char16_t*>(utf16_data.data()));
|
||||||
|
ASSERT(result == utf16_data.size());
|
||||||
|
|
||||||
|
return utf16_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view)
|
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view)
|
||||||
{
|
{
|
||||||
return to_utf16_impl(utf32_view);
|
Utf16Data utf16_data;
|
||||||
|
|
||||||
|
TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(
|
||||||
|
reinterpret_cast<char32_t const*>(utf32_view.code_points()),
|
||||||
|
utf32_view.length())));
|
||||||
|
|
||||||
|
[[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16(
|
||||||
|
reinterpret_cast<char32_t const*>(utf32_view.code_points()),
|
||||||
|
utf32_view.length(),
|
||||||
|
reinterpret_cast<char16_t*>(utf16_data.data()));
|
||||||
|
ASSERT(result == utf16_data.size());
|
||||||
|
|
||||||
|
return utf16_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point)
|
ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point)
|
||||||
|
@ -92,30 +120,27 @@ ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invali
|
||||||
|
|
||||||
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
|
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||||
{
|
{
|
||||||
|
if (allow_invalid_code_units == AllowInvalidCodeUnits::No)
|
||||||
|
return String::from_utf16(*this);
|
||||||
|
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
|
|
||||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::Yes) {
|
for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
|
||||||
for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
|
if (is_high_surrogate(*ptr)) {
|
||||||
if (is_high_surrogate(*ptr)) {
|
auto const* next = ptr + 1;
|
||||||
auto const* next = ptr + 1;
|
|
||||||
|
|
||||||
if ((next < end_ptr()) && is_low_surrogate(*next)) {
|
if ((next < end_ptr()) && is_low_surrogate(*next)) {
|
||||||
auto code_point = decode_surrogate_pair(*ptr, *next);
|
auto code_point = decode_surrogate_pair(*ptr, *next);
|
||||||
TRY(builder.try_append_code_point(code_point));
|
TRY(builder.try_append_code_point(code_point));
|
||||||
++ptr;
|
++ptr;
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
|
|
||||||
}
|
}
|
||||||
return builder.to_string_without_validation();
|
|
||||||
|
TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto code_point : *this)
|
return builder.to_string_without_validation();
|
||||||
TRY(builder.try_append_code_point(code_point));
|
|
||||||
|
|
||||||
return builder.to_string();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf16View::length_in_code_points() const
|
size_t Utf16View::length_in_code_points() const
|
||||||
|
|
|
@ -63,7 +63,7 @@ TEST_CASE(encode_utf8)
|
||||||
auto encoded = Array { (u16)0xd83d };
|
auto encoded = Array { (u16)0xd83d };
|
||||||
Utf16View view { encoded };
|
Utf16View view { encoded };
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), "\ufffd"sv);
|
EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -307,7 +307,7 @@ TEST_CASE(substring_view)
|
||||||
|
|
||||||
EXPECT(view.length_in_code_units() == 1);
|
EXPECT(view.length_in_code_units() == 1);
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), "\ufffd"sv);
|
EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue