mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-30 20:59:16 +00:00
AK+LibJS+LibWeb+LibRegex: Replace AK::Utf16Data with AK::Utf16String
This commit is contained in:
parent
a43cb15e81
commit
9582895759
Notes:
github-actions[bot]
2025-07-18 16:46:53 +00:00
Author: https://github.com/trflynn89
Commit: 9582895759
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5388
Reviewed-by: https://github.com/shannonbooth ✅
22 changed files with 101 additions and 222 deletions
|
@ -67,11 +67,6 @@ ErrorOr<String> String::from_utf8(StringView view)
|
|||
return result;
|
||||
}
|
||||
|
||||
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
||||
{
|
||||
return utf16.to_utf8();
|
||||
}
|
||||
|
||||
ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes bytes)
|
||||
{
|
||||
if (bytes.is_empty())
|
||||
|
@ -80,7 +75,7 @@ ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes b
|
|||
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
|
||||
auto utf16_length = bytes.size() / 2;
|
||||
|
||||
Utf16Data well_formed_utf16;
|
||||
Vector<char16_t> well_formed_utf16;
|
||||
|
||||
if (!validate_utf16_le(bytes)) {
|
||||
well_formed_utf16.resize(bytes.size());
|
||||
|
@ -109,7 +104,7 @@ ErrorOr<String> String::from_utf16_be_with_replacement_character(ReadonlyBytes b
|
|||
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
|
||||
auto utf16_length = bytes.size() / 2;
|
||||
|
||||
Utf16Data well_formed_utf16;
|
||||
Vector<char16_t> well_formed_utf16;
|
||||
|
||||
if (!validate_utf16_le(bytes)) {
|
||||
well_formed_utf16.resize(bytes.size());
|
||||
|
|
|
@ -69,7 +69,6 @@ public:
|
|||
[[nodiscard]] static String from_string_builder_without_validation(Badge<StringBuilder>, StringBuilder&);
|
||||
|
||||
// Creates a new String from a sequence of UTF-16 encoded code points.
|
||||
static ErrorOr<String> from_utf16(Utf16View const&);
|
||||
static ErrorOr<String> from_utf16_le_with_replacement_character(ReadonlyBytes);
|
||||
static ErrorOr<String> from_utf16_be_with_replacement_character(ReadonlyBytes);
|
||||
|
||||
|
|
|
@ -10,77 +10,12 @@
|
|||
#include <AK/StringView.h>
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
|
||||
#include <simdutf.h>
|
||||
|
||||
namespace AK {
|
||||
|
||||
template<OneOf<Utf8View, Utf32View> UtfViewType>
|
||||
static ErrorOr<Utf16ConversionResult> to_utf16_slow(UtfViewType const& view)
|
||||
{
|
||||
Utf16Data utf16_data;
|
||||
TRY(utf16_data.try_ensure_capacity(view.length()));
|
||||
|
||||
size_t code_point_count = 0;
|
||||
for (auto code_point : view) {
|
||||
TRY(UnicodeUtils::try_code_point_to_utf16(code_point, [&](auto code_unit) -> ErrorOr<void> {
|
||||
TRY(utf16_data.try_append(code_unit));
|
||||
return {};
|
||||
}));
|
||||
|
||||
code_point_count++;
|
||||
}
|
||||
|
||||
return Utf16ConversionResult { move(utf16_data), code_point_count };
|
||||
}
|
||||
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView utf8_view)
|
||||
{
|
||||
return utf8_to_utf16(Utf8View { utf8_view });
|
||||
}
|
||||
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const& utf8_view)
|
||||
{
|
||||
if (utf8_view.is_empty())
|
||||
return Utf16ConversionResult { Utf16Data {}, 0 };
|
||||
|
||||
// All callers want to allow lonely surrogates, which simdutf does not permit.
|
||||
if (!utf8_view.validate(AllowLonelySurrogates::No)) [[unlikely]]
|
||||
return to_utf16_slow(utf8_view);
|
||||
|
||||
auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
|
||||
auto length = utf8_view.byte_length();
|
||||
|
||||
Utf16Data utf16_data;
|
||||
TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length)));
|
||||
// FIXME: simdutf _could_ be telling us about this, but it doesn't -- so we have to compute it again.
|
||||
auto code_point_length = simdutf::count_utf8(data, length);
|
||||
|
||||
[[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16(data, length, reinterpret_cast<char16_t*>(utf16_data.data()));
|
||||
ASSERT(result == utf16_data.size());
|
||||
|
||||
return Utf16ConversionResult { utf16_data, code_point_length };
|
||||
}
|
||||
|
||||
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view)
|
||||
{
|
||||
if (utf32_view.is_empty())
|
||||
return Utf16ConversionResult { Utf16Data {}, 0 };
|
||||
|
||||
auto const* data = reinterpret_cast<char32_t const*>(utf32_view.code_points());
|
||||
auto length = utf32_view.length();
|
||||
|
||||
Utf16Data utf16_data;
|
||||
TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(data, length)));
|
||||
|
||||
[[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16(data, length, reinterpret_cast<char16_t*>(utf16_data.data()));
|
||||
ASSERT(result == utf16_data.size());
|
||||
|
||||
return Utf16ConversionResult { utf16_data, length };
|
||||
}
|
||||
|
||||
bool validate_utf16_le(ReadonlyBytes bytes)
|
||||
{
|
||||
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||
|
|
|
@ -23,16 +23,6 @@
|
|||
|
||||
namespace AK {
|
||||
|
||||
using Utf16Data = Vector<char16_t, 1>;
|
||||
|
||||
struct Utf16ConversionResult {
|
||||
Utf16Data data;
|
||||
size_t code_point_count;
|
||||
};
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView);
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const&);
|
||||
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&);
|
||||
|
||||
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
|
||||
[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
|
||||
|
||||
|
@ -156,13 +146,6 @@ public:
|
|||
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
|
||||
}
|
||||
|
||||
constexpr Utf16View(Utf16Data const& string)
|
||||
: m_string { .utf16 = string.data() }
|
||||
, m_length_in_code_units(string.size())
|
||||
{
|
||||
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
|
||||
}
|
||||
|
||||
consteval Utf16View(StringView string)
|
||||
: m_string { .ascii = string.characters_without_null_termination() }
|
||||
, m_length_in_code_units(string.length())
|
||||
|
@ -170,15 +153,6 @@ public:
|
|||
VERIFY(all_of(string, AK::is_ascii));
|
||||
}
|
||||
|
||||
Utf16View(Utf16ConversionResult&&) = delete;
|
||||
explicit Utf16View(Utf16ConversionResult const& conversion_result)
|
||||
: m_string { .utf16 = conversion_result.data.data() }
|
||||
, m_length_in_code_units(conversion_result.data.size())
|
||||
, m_length_in_code_points(conversion_result.code_point_count)
|
||||
{
|
||||
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
|
||||
}
|
||||
|
||||
ErrorOr<String> to_utf8(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||
ErrorOr<ByteString> to_byte_string(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||
|
||||
|
@ -314,18 +288,6 @@ public:
|
|||
return m_length_in_code_points;
|
||||
}
|
||||
|
||||
constexpr Optional<size_t> length_in_code_points_if_known() const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return m_length_in_code_units;
|
||||
|
||||
if (m_length_in_code_points == NumericLimits<size_t>::max())
|
||||
return {};
|
||||
return m_length_in_code_points;
|
||||
}
|
||||
|
||||
constexpr void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
|
||||
|
||||
[[nodiscard]] constexpr char16_t code_unit_at(size_t index) const
|
||||
{
|
||||
VERIFY(index < length_in_code_units());
|
||||
|
@ -591,6 +553,5 @@ inline constexpr bool IsHashCompatible<Utf16String, Utf16View> = true;
|
|||
}
|
||||
|
||||
#if USING_AK_GLOBALLY
|
||||
using AK::Utf16Data;
|
||||
using AK::Utf16View;
|
||||
#endif
|
||||
|
|
|
@ -111,7 +111,7 @@ ErrorOr<String> Process::get_name()
|
|||
if (!length)
|
||||
return Error::from_windows_error();
|
||||
|
||||
return String::from_utf16(Utf16View { reinterpret_cast<char16_t const*>(path), length });
|
||||
return MUST(Utf16View { reinterpret_cast<char16_t const*>(path), length }.to_utf8());
|
||||
}
|
||||
|
||||
ErrorOr<void> Process::set_name(StringView, SetThreadName)
|
||||
|
|
|
@ -559,7 +559,7 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::encode_uri_component)
|
|||
JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape)
|
||||
{
|
||||
// 1. Set string to ? ToString(string).
|
||||
auto string = TRY(vm.argument(0).to_byte_string(vm));
|
||||
auto string = TRY(vm.argument(0).to_utf16_string(vm));
|
||||
|
||||
// 3. Let R be the empty String.
|
||||
StringBuilder escaped;
|
||||
|
@ -570,29 +570,29 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape)
|
|||
// 2. Let length be the length of string.
|
||||
// 5. Let k be 0.
|
||||
// 6. Repeat, while k < length,
|
||||
auto utf16_conversion = TRY_OR_THROW_OOM(vm, utf8_to_utf16(string));
|
||||
for (auto code_point : utf16_conversion.data) {
|
||||
for (size_t k = 0; k < string.length_in_code_units(); ++k) {
|
||||
// a. Let char be the code unit at index k within string.
|
||||
auto code_unit = string.code_unit_at(k);
|
||||
|
||||
// b. If unescapedSet contains char, then
|
||||
// NOTE: We know unescapedSet is ASCII-only, so ensure we have an ASCII codepoint before casting to char.
|
||||
if (is_ascii(code_point) && unescaped_set.contains(static_cast<char>(code_point))) {
|
||||
if (is_ascii(code_unit) && unescaped_set.contains(static_cast<char>(code_unit))) {
|
||||
// i. Let S be the String value containing the single code unit char.
|
||||
escaped.append(code_point);
|
||||
escaped.append(static_cast<char>(code_unit));
|
||||
}
|
||||
// c. Else,
|
||||
// i. Let n be the numeric value of char.
|
||||
// ii. If n < 256, then
|
||||
else if (code_point < 256) {
|
||||
else if (code_unit < 256) {
|
||||
// 1. Let hex be the String representation of n, formatted as an uppercase hexadecimal number.
|
||||
// 2. Let S be the string-concatenation of "%" and ! StringPad(hex, 2𝔽, "0", start).
|
||||
escaped.appendff("%{:02X}", code_point);
|
||||
escaped.appendff("%{:02X}", code_unit);
|
||||
}
|
||||
// iii. Else,
|
||||
else {
|
||||
// 1. Let hex be the String representation of n, formatted as an uppercase hexadecimal number.
|
||||
// 2. Let S be the string-concatenation of "%u" and ! StringPad(hex, 4𝔽, "0", start).
|
||||
escaped.appendff("%u{:04X}", code_point);
|
||||
escaped.appendff("%u{:04X}", code_unit);
|
||||
}
|
||||
|
||||
// d. Set R to the string-concatenation of R and S.
|
||||
|
|
|
@ -93,26 +93,21 @@ ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern,
|
|||
if (unicode && unicode_sets)
|
||||
return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v')) };
|
||||
|
||||
auto utf16_pattern_result = AK::utf8_to_utf16(pattern);
|
||||
if (utf16_pattern_result.is_error())
|
||||
return ParseRegexPatternError { "Out of memory"_string };
|
||||
|
||||
auto utf16_result = utf16_pattern_result.release_value();
|
||||
Utf16View utf16_pattern_view { utf16_result };
|
||||
auto utf16_pattern = Utf16String::from_utf8(pattern);
|
||||
StringBuilder builder;
|
||||
|
||||
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
||||
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
|
||||
auto previous_code_unit_was_backslash = false;
|
||||
for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
|
||||
for (size_t i = 0; i < utf16_pattern.length_in_code_units();) {
|
||||
if (unicode || unicode_sets) {
|
||||
auto code_point = code_point_at(utf16_pattern_view, i);
|
||||
auto code_point = code_point_at(utf16_pattern, i);
|
||||
builder.append_code_point(code_point.code_point);
|
||||
i += code_point.code_unit_count;
|
||||
continue;
|
||||
}
|
||||
|
||||
u16 code_unit = utf16_pattern_view.code_unit_at(i);
|
||||
u16 code_unit = utf16_pattern.code_unit_at(i);
|
||||
++i;
|
||||
|
||||
if (code_unit > 0x7f) {
|
||||
|
|
|
@ -512,7 +512,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
Optional<ByteString> str;
|
||||
Utf16Data utf16;
|
||||
Utf16String utf16;
|
||||
Vector<u32> data;
|
||||
data.ensure_capacity(length);
|
||||
for (size_t i = offset; i < offset + length; ++i)
|
||||
|
|
|
@ -8,14 +8,15 @@
|
|||
|
||||
#include "Forward.h"
|
||||
#include "RegexOptions.h"
|
||||
#include <AK/Error.h>
|
||||
|
||||
#include <AK/ByteString.h>
|
||||
#include <AK/COWVector.h>
|
||||
#include <AK/Error.h>
|
||||
#include <AK/FlyString.h>
|
||||
#include <AK/MemMem.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
|
@ -110,7 +111,7 @@ public:
|
|||
return view;
|
||||
}
|
||||
|
||||
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16Data& optional_utf16_storage) const
|
||||
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16String& optional_utf16_storage) const
|
||||
{
|
||||
auto view = m_view.visit(
|
||||
[&optional_string_storage, data]<typename T>(T const&) {
|
||||
|
@ -121,11 +122,8 @@ public:
|
|||
return RegexStringView { T { *optional_string_storage } };
|
||||
},
|
||||
[&optional_utf16_storage, data](Utf16View) {
|
||||
auto conversion_result = utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
|
||||
optional_utf16_storage = conversion_result.data;
|
||||
auto view = Utf16View { optional_utf16_storage };
|
||||
view.unsafe_set_code_point_length(conversion_result.code_point_count);
|
||||
return RegexStringView { view };
|
||||
optional_utf16_storage = Utf16String::from_utf32({ data.data(), data.size() });
|
||||
return RegexStringView { optional_utf16_storage.utf16_view() };
|
||||
});
|
||||
|
||||
view.set_unicode(unicode());
|
||||
|
|
|
@ -46,9 +46,8 @@ WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t
|
|||
{
|
||||
// 1. Let length be node’s length.
|
||||
// FIXME: This is very inefficient!
|
||||
auto utf16_result = MUST(AK::utf8_to_utf16(m_data));
|
||||
Utf16View utf16_view { utf16_result };
|
||||
auto length = utf16_view.length_in_code_units();
|
||||
auto utf16_string = Utf16String::from_utf8(m_data);
|
||||
auto length = utf16_string.length_in_code_units();
|
||||
|
||||
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
|
||||
if (offset > length)
|
||||
|
@ -57,10 +56,10 @@ WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t
|
|||
// 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit
|
||||
// to the end of node’s data, and then return.
|
||||
if (offset + count > length)
|
||||
return MUST(utf16_view.substring_view(offset).to_utf8());
|
||||
return MUST(utf16_string.substring_view(offset).to_utf8());
|
||||
|
||||
// 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in node’s data.
|
||||
return MUST(utf16_view.substring_view(offset, count).to_utf8());
|
||||
return MUST(utf16_string.substring_view(offset, count).to_utf8());
|
||||
}
|
||||
|
||||
// https://dom.spec.whatwg.org/#concept-cd-replace
|
||||
|
@ -68,9 +67,8 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
|||
{
|
||||
// 1. Let length be node’s length.
|
||||
// FIXME: This is very inefficient!
|
||||
auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
|
||||
Utf16View utf16_view { utf16_data };
|
||||
auto length = utf16_view.length_in_code_units();
|
||||
auto utf16_string = Utf16String::from_utf8(m_data);
|
||||
auto length = utf16_string.length_in_code_units();
|
||||
|
||||
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
|
||||
if (offset > length)
|
||||
|
@ -83,17 +81,17 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
|||
// 5. Insert data into node’s data after offset code units.
|
||||
// 6. Let delete offset be offset + data’s length.
|
||||
// 7. Starting from delete offset code units, remove count code units from node’s data.
|
||||
auto before_data = utf16_view.substring_view(0, offset);
|
||||
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
|
||||
auto after_data = utf16_view.substring_view(offset + count);
|
||||
auto before_data = utf16_string.substring_view(0, offset);
|
||||
auto inserted_data = Utf16String::from_utf8(data);
|
||||
auto after_data = utf16_string.substring_view(offset + count);
|
||||
|
||||
StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
|
||||
StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data.length_in_code_units() + after_data.length_in_code_units());
|
||||
full_data.append(before_data);
|
||||
full_data.append(inserted_data_result.data);
|
||||
full_data.append(inserted_data);
|
||||
full_data.append(after_data);
|
||||
auto full_view = full_data.utf16_string_view();
|
||||
|
||||
bool characters_are_the_same = utf16_view == full_view;
|
||||
auto full_view = full_data.utf16_string_view();
|
||||
bool characters_are_the_same = utf16_string == full_view;
|
||||
auto old_data = m_data;
|
||||
|
||||
// OPTIMIZATION: Skip UTF-8 encoding if the characters are the same.
|
||||
|
@ -123,14 +121,14 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
|||
// start offset by data’s length and decrease it by count.
|
||||
for (auto* range : Range::live_ranges()) {
|
||||
if (range->start_container() == this && range->start_offset() > (offset + count))
|
||||
range->set_start_offset(range->start_offset() + inserted_data_result.data.size() - count);
|
||||
range->set_start_offset(range->start_offset() + inserted_data.length_in_code_units() - count);
|
||||
}
|
||||
|
||||
// 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end
|
||||
// offset by data’s length and decrease it by count.
|
||||
for (auto* range : Range::live_ranges()) {
|
||||
if (range->end_container() == this && range->end_offset() > (offset + count))
|
||||
range->set_end_offset(range->end_offset() + inserted_data_result.data.size() - count);
|
||||
range->set_end_offset(range->end_offset() + inserted_data.length_in_code_units() - count);
|
||||
}
|
||||
|
||||
// 12. If node’s parent is non-null, then run the children changed steps for node’s parent.
|
||||
|
|
|
@ -6158,8 +6158,7 @@ Vector<GC::Root<Range>> Document::find_matching_text(String const& query, CaseSe
|
|||
if (text_blocks.is_empty())
|
||||
return {};
|
||||
|
||||
auto utf16_query = MUST(AK::utf8_to_utf16(query));
|
||||
Utf16View query_view { utf16_query };
|
||||
auto utf16_query = Utf16String::from_utf8(query);
|
||||
|
||||
Vector<GC::Root<Range>> matches;
|
||||
for (auto const& text_block : text_blocks) {
|
||||
|
@ -6169,8 +6168,8 @@ Vector<GC::Root<Range>> Document::find_matching_text(String const& query, CaseSe
|
|||
auto* match_start_position = text_block.positions.data();
|
||||
while (true) {
|
||||
auto match_index = case_sensitivity == CaseSensitivity::CaseInsensitive
|
||||
? text_view.find_code_unit_offset_ignoring_case(query_view, offset)
|
||||
: text_view.find_code_unit_offset(query_view, offset);
|
||||
? text_view.find_code_unit_offset_ignoring_case(utf16_query, offset)
|
||||
: text_view.find_code_unit_offset(utf16_query, offset);
|
||||
if (!match_index.has_value())
|
||||
break;
|
||||
|
||||
|
@ -6181,15 +6180,15 @@ Vector<GC::Root<Range>> Document::find_matching_text(String const& query, CaseSe
|
|||
auto& start_dom_node = match_start_position->dom_node;
|
||||
|
||||
auto* match_end_position = match_start_position;
|
||||
for (; i < text_block.positions.size() - 1 && (match_index.value() + query_view.length_in_code_units() > text_block.positions[i + 1].start_offset); ++i)
|
||||
for (; i < text_block.positions.size() - 1 && (match_index.value() + utf16_query.length_in_code_units() > text_block.positions[i + 1].start_offset); ++i)
|
||||
match_end_position = &text_block.positions[i + 1];
|
||||
|
||||
auto& end_dom_node = match_end_position->dom_node;
|
||||
auto end_position = match_index.value() + query_view.length_in_code_units() - match_end_position->start_offset;
|
||||
auto end_position = match_index.value() + utf16_query.length_in_code_units() - match_end_position->start_offset;
|
||||
|
||||
matches.append(Range::create(start_dom_node, start_position, end_dom_node, end_position));
|
||||
match_start_position = match_end_position;
|
||||
offset = match_index.value() + query_view.length_in_code_units() + 1;
|
||||
offset = match_index.value() + utf16_query.length_in_code_units() + 1;
|
||||
if (offset >= text_view.length_in_code_units())
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -384,9 +384,9 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa
|
|||
auto parent_white_space_collapse = resolved_keyword(*start_node->parent(), CSS::PropertyID::WhiteSpaceCollapse);
|
||||
|
||||
// FIXME: Find a way to get code points directly from the UTF-8 string
|
||||
auto start_node_data = *start_node->text_content();
|
||||
auto utf16_code_units = MUST(AK::utf8_to_utf16(start_node_data));
|
||||
auto offset_minus_one_code_point = Utf16View { utf16_code_units }.code_point_at(start_offset - 1);
|
||||
auto start_node_data = Utf16String::from_utf8(*start_node->text_content());
|
||||
auto offset_minus_one_code_point = start_node_data.code_point_at(start_offset - 1);
|
||||
|
||||
if (parent_white_space_collapse != CSS::Keyword::Preserve && (offset_minus_one_code_point == 0x20 || offset_minus_one_code_point == 0xA0)) {
|
||||
--start_offset;
|
||||
continue;
|
||||
|
@ -437,9 +437,9 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa
|
|||
auto parent_white_space_collapse = resolved_keyword(*end_node->parent(), CSS::PropertyID::WhiteSpaceCollapse);
|
||||
|
||||
// FIXME: Find a way to get code points directly from the UTF-8 string
|
||||
auto end_node_data = *end_node->text_content();
|
||||
auto utf16_code_units = MUST(AK::utf8_to_utf16(end_node_data));
|
||||
auto offset_code_point = Utf16View { utf16_code_units }.code_point_at(end_offset);
|
||||
auto end_node_data = Utf16String::from_utf8(*end_node->text_content());
|
||||
auto offset_code_point = end_node_data.code_point_at(end_offset);
|
||||
|
||||
if (parent_white_space_collapse != CSS::Keyword::Preserve && (offset_code_point == 0x20 || offset_code_point == 0xA0)) {
|
||||
// 1. If fix collapsed space is true, and collapse spaces is true, and the end offsetth
|
||||
// code unit of end node's data is a space (0x0020): call deleteData(end offset, 1)
|
||||
|
@ -556,16 +556,14 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa
|
|||
// 1. Remove the first code unit from replacement whitespace, and let element be that
|
||||
// code unit.
|
||||
// FIXME: Find a way to get code points directly from the UTF-8 string
|
||||
auto replacement_whitespace_utf16 = MUST(AK::utf8_to_utf16(replacement_whitespace));
|
||||
auto replacement_whitespace_utf16_view = Utf16View { replacement_whitespace_utf16 };
|
||||
replacement_whitespace = MUST(String::from_utf16({ replacement_whitespace_utf16_view.substring_view(1) }));
|
||||
auto element = replacement_whitespace_utf16_view.code_point_at(0);
|
||||
auto replacement_whitespace_utf16 = Utf16String::from_utf8(replacement_whitespace);
|
||||
replacement_whitespace = MUST(replacement_whitespace_utf16.substring_view(1).to_utf8());
|
||||
auto element = replacement_whitespace_utf16.code_point_at(0);
|
||||
|
||||
// 2. If element is not the same as the start offsetth code unit of start node's data:
|
||||
auto start_node_data = *start_node->text_content();
|
||||
auto start_node_utf16 = MUST(AK::utf8_to_utf16(start_node_data));
|
||||
auto start_node_utf16_view = Utf16View { start_node_utf16 };
|
||||
auto start_node_code_point = start_node_utf16_view.code_point_at(start_offset);
|
||||
auto start_node_data = Utf16String::from_utf8(*start_node->text_content());
|
||||
auto start_node_code_point = start_node_data.code_point_at(start_offset);
|
||||
|
||||
if (element != start_node_code_point) {
|
||||
// 1. Call insertData(start offset, element) on start node.
|
||||
auto& start_node_character_data = static_cast<DOM::CharacterData&>(*start_node);
|
||||
|
|
|
@ -106,11 +106,10 @@ WebIDL::ExceptionOr<FileReader::Result> FileReader::blob_package_data(JS::Realm&
|
|||
return JS::ArrayBuffer::create(realm, move(bytes));
|
||||
case Type::BinaryString:
|
||||
// Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255].
|
||||
Utf16Data builder;
|
||||
builder.ensure_capacity(bytes.size());
|
||||
StringBuilder builder(StringBuilder::Mode::UTF16, bytes.size());
|
||||
for (auto byte : bytes.bytes())
|
||||
builder.unchecked_append(byte);
|
||||
return MUST(Utf16View { builder }.to_utf8());
|
||||
builder.append_code_unit(byte);
|
||||
return MUST(builder.utf16_string_view().to_utf8());
|
||||
}
|
||||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include <AK/FlyString.h>
|
||||
#include <AK/GenericLexer.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibWeb/Infra/CharacterTypes.h>
|
||||
|
@ -63,10 +64,8 @@ ErrorOr<String> strip_and_collapse_whitespace(StringView string)
|
|||
// https://infra.spec.whatwg.org/#code-unit-prefix
|
||||
bool is_code_unit_prefix(StringView potential_prefix_utf8, StringView input_utf8)
|
||||
{
|
||||
auto potential_prefix_utf16_bytes = MUST(utf8_to_utf16(potential_prefix_utf8));
|
||||
auto input_utf16_bytes = MUST(utf8_to_utf16(input_utf8));
|
||||
Utf16View potential_prefix { potential_prefix_utf16_bytes };
|
||||
Utf16View input { input_utf16_bytes };
|
||||
auto potential_prefix = Utf16String::from_utf8(potential_prefix_utf8);
|
||||
auto input = Utf16String::from_utf8(input_utf8);
|
||||
|
||||
// 1. Let i be 0.
|
||||
size_t i = 0;
|
||||
|
@ -148,9 +147,10 @@ bool code_unit_less_than(StringView a, StringView b)
|
|||
if (a.is_ascii() && b.is_ascii())
|
||||
return a < b;
|
||||
|
||||
auto a_utf16 = MUST(utf8_to_utf16(a));
|
||||
auto b_utf16 = MUST(utf8_to_utf16(b));
|
||||
return Utf16View { a_utf16 }.is_code_unit_less_than(Utf16View { b_utf16 });
|
||||
auto a_utf16 = Utf16String::from_utf8(a);
|
||||
auto b_utf16 = Utf16String::from_utf8(b);
|
||||
|
||||
return a_utf16.utf16_view().is_code_unit_less_than(b_utf16);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -50,17 +50,18 @@ Vector<Viewport::TextBlock> const& Viewport::text_blocks()
|
|||
|
||||
void Viewport::update_text_blocks()
|
||||
{
|
||||
StringBuilder builder;
|
||||
StringBuilder builder(StringBuilder::Mode::UTF16);
|
||||
size_t current_start_position = 0;
|
||||
Vector<TextPosition> text_positions;
|
||||
Vector<TextBlock> text_blocks;
|
||||
|
||||
for_each_in_inclusive_subtree([&](auto const& layout_node) {
|
||||
if (layout_node.display().is_none() || !layout_node.first_paintable() || !layout_node.first_paintable()->is_visible())
|
||||
return TraversalDecision::Continue;
|
||||
|
||||
if (layout_node.is_box() || layout_node.is_generated()) {
|
||||
if (!builder.is_empty()) {
|
||||
text_blocks.append({ MUST(AK::utf8_to_utf16(builder.string_view())), text_positions });
|
||||
text_blocks.append({ builder.to_utf16_string(), text_positions });
|
||||
current_start_position = 0;
|
||||
text_positions.clear_with_capacity();
|
||||
builder.clear();
|
||||
|
@ -79,10 +80,9 @@ void Viewport::update_text_blocks()
|
|||
text_positions.empend(dom_node, current_start_position);
|
||||
}
|
||||
|
||||
auto const& current_node_text = text_node->text_for_rendering();
|
||||
auto const current_node_text_utf16 = MUST(AK::utf8_to_utf16(current_node_text));
|
||||
current_start_position += current_node_text_utf16.data.size();
|
||||
builder.append(move(current_node_text));
|
||||
auto const& current_node_text = Utf16String::from_utf8(text_node->text_for_rendering());
|
||||
current_start_position += current_node_text.length_in_code_units();
|
||||
builder.append(current_node_text);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,7 +90,7 @@ void Viewport::update_text_blocks()
|
|||
});
|
||||
|
||||
if (!builder.is_empty())
|
||||
text_blocks.append({ MUST(AK::utf8_to_utf16(builder.string_view())), text_positions });
|
||||
text_blocks.append({ builder.to_utf16_string(), text_positions });
|
||||
|
||||
m_text_blocks = move(text_blocks);
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Utf16String.h>
|
||||
#include <LibWeb/DOM/Document.h>
|
||||
#include <LibWeb/Layout/BlockContainer.h>
|
||||
|
||||
|
@ -24,7 +25,7 @@ public:
|
|||
size_t start_offset { 0 };
|
||||
};
|
||||
struct TextBlock {
|
||||
AK::Utf16ConversionResult text;
|
||||
Utf16String text;
|
||||
Vector<TextPosition> positions;
|
||||
};
|
||||
Vector<TextBlock> const& text_blocks();
|
||||
|
|
|
@ -252,9 +252,8 @@ Utf16View PaintableFragment::utf16_view() const
|
|||
return {};
|
||||
|
||||
if (!m_text_in_utf16.has_value())
|
||||
m_text_in_utf16 = MUST(AK::utf8_to_utf16(utf8_view()));
|
||||
|
||||
return Utf16View { m_text_in_utf16.value() };
|
||||
m_text_in_utf16 = Utf16String::from_utf8(utf8_view().as_string());
|
||||
return *m_text_in_utf16;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Utf16String.h>
|
||||
#include <LibGfx/TextLayout.h>
|
||||
#include <LibWeb/Layout/Node.h>
|
||||
#include <LibWeb/Painting/ShadowData.h>
|
||||
|
@ -64,7 +65,7 @@ private:
|
|||
CSS::WritingMode m_writing_mode;
|
||||
Vector<ShadowData> m_shadows;
|
||||
CSSPixels m_text_decoration_thickness { 0 };
|
||||
mutable Optional<AK::Utf16ConversionResult> m_text_in_utf16;
|
||||
mutable Optional<Utf16String> m_text_in_utf16;
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -48,8 +48,8 @@ ByteString SVGTextContentElement::text_contents() const
|
|||
// https://svgwg.org/svg2-draft/text.html#__svg__SVGTextContentElement__getNumberOfChars
|
||||
WebIDL::ExceptionOr<WebIDL::Long> SVGTextContentElement::get_number_of_chars() const
|
||||
{
|
||||
auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents())).data;
|
||||
return static_cast<WebIDL::Long>(chars.size());
|
||||
auto length_in_code_units = AK::utf16_code_unit_length_from_utf8(text_contents());
|
||||
return static_cast<WebIDL::Long>(length_in_code_units);
|
||||
}
|
||||
|
||||
GC::Ref<Geometry::DOMPoint> SVGTextContentElement::get_start_position_of_char(WebIDL::UnsignedLong charnum)
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
TEST_CASE(decode_ascii)
|
||||
{
|
||||
auto string = MUST(AK::utf8_to_utf16("Hello World!11"sv));
|
||||
auto string = Utf16String::from_utf8("Hello World!11"sv);
|
||||
Utf16View view { string };
|
||||
|
||||
size_t valid_code_units = 0;
|
||||
|
@ -34,7 +34,7 @@ TEST_CASE(decode_ascii)
|
|||
|
||||
TEST_CASE(decode_utf8)
|
||||
{
|
||||
auto string = MUST(AK::utf8_to_utf16("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv));
|
||||
auto string = Utf16String::from_utf8("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv);
|
||||
Utf16View view { string };
|
||||
|
||||
size_t valid_code_units = 0;
|
||||
|
@ -55,7 +55,7 @@ TEST_CASE(encode_utf8)
|
|||
{
|
||||
{
|
||||
auto utf8_string = "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"_string;
|
||||
auto string = MUST(AK::utf8_to_utf16(utf8_string));
|
||||
auto string = Utf16String::from_utf8(utf8_string);
|
||||
Utf16View view { string };
|
||||
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), utf8_string);
|
||||
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::No)), utf8_string);
|
||||
|
@ -139,7 +139,7 @@ TEST_CASE(utf16_literal)
|
|||
|
||||
TEST_CASE(iterate_utf16)
|
||||
{
|
||||
auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv));
|
||||
auto string = Utf16String::from_utf8("Привет 😀"sv);
|
||||
Utf16View view { string };
|
||||
auto iterator = view.begin();
|
||||
|
||||
|
@ -371,16 +371,16 @@ TEST_CASE(to_ascii_titlecase)
|
|||
|
||||
TEST_CASE(equals_ignoring_case)
|
||||
{
|
||||
auto string1 = MUST(AK::utf8_to_utf16("foobar"sv));
|
||||
auto string2 = MUST(AK::utf8_to_utf16("FooBar"sv));
|
||||
auto string1 = Utf16String::from_utf8("foobar"sv);
|
||||
auto string2 = Utf16String::from_utf8("FooBar"sv);
|
||||
EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
|
||||
|
||||
string1 = MUST(AK::utf8_to_utf16(""sv));
|
||||
string2 = MUST(AK::utf8_to_utf16(""sv));
|
||||
string1 = Utf16String::from_utf8(""sv);
|
||||
string2 = Utf16String::from_utf8(""sv);
|
||||
EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
|
||||
|
||||
string1 = MUST(AK::utf8_to_utf16(""sv));
|
||||
string2 = MUST(AK::utf8_to_utf16("FooBar"sv));
|
||||
string1 = Utf16String::from_utf8(""sv);
|
||||
string2 = Utf16String::from_utf8("FooBar"sv);
|
||||
EXPECT(!Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
|
||||
}
|
||||
|
||||
|
@ -425,7 +425,7 @@ TEST_CASE(replace)
|
|||
|
||||
TEST_CASE(substring_view)
|
||||
{
|
||||
auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv));
|
||||
auto string = Utf16String::from_utf8("Привет 😀"sv);
|
||||
{
|
||||
Utf16View view { string };
|
||||
view = view.substring_view(7, 2);
|
||||
|
@ -532,7 +532,7 @@ TEST_CASE(starts_with)
|
|||
|
||||
TEST_CASE(find_code_unit_offset)
|
||||
{
|
||||
auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv));
|
||||
auto conversion_result = Utf16String::from_utf8("😀foo😀bar"sv);
|
||||
Utf16View const view { conversion_result };
|
||||
|
||||
EXPECT_EQ(0u, view.find_code_unit_offset(u""sv).value());
|
||||
|
@ -549,7 +549,7 @@ TEST_CASE(find_code_unit_offset)
|
|||
|
||||
TEST_CASE(find_code_unit_offset_ignoring_case)
|
||||
{
|
||||
auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv));
|
||||
auto conversion_result = Utf16String::from_utf8("😀Foo😀Bar"sv);
|
||||
Utf16View const view { conversion_result };
|
||||
|
||||
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u""sv).value());
|
||||
|
|
|
@ -823,7 +823,7 @@ TEST_CASE(ECMA262_unicode_match)
|
|||
for (auto& test : tests) {
|
||||
Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
|
||||
|
||||
auto subject = MUST(AK::utf8_to_utf16(test.subject));
|
||||
auto subject = Utf16String::from_utf8(test.subject);
|
||||
Utf16View view { subject };
|
||||
|
||||
if constexpr (REGEX_DEBUG) {
|
||||
|
@ -956,7 +956,7 @@ TEST_CASE(ECMA262_property_match)
|
|||
for (auto& test : tests) {
|
||||
Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options);
|
||||
|
||||
auto subject = MUST(AK::utf8_to_utf16(test.subject));
|
||||
auto subject = Utf16String::from_utf8(test.subject);
|
||||
Utf16View view { subject };
|
||||
|
||||
if constexpr (REGEX_DEBUG) {
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <AK/Array.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibUnicode/Segmenter.h>
|
||||
|
@ -155,21 +156,21 @@ TEST_CASE(out_of_bounds)
|
|||
EXPECT(!result.has_value());
|
||||
}
|
||||
{
|
||||
auto text = MUST(AK::utf8_to_utf16("foo"sv));
|
||||
auto text = u"foo"_utf16;
|
||||
|
||||
auto segmenter = Unicode::Segmenter::create(Unicode::SegmenterGranularity::Word);
|
||||
segmenter->set_segmented_text(Utf16View { text });
|
||||
segmenter->set_segmented_text(text);
|
||||
|
||||
auto result = segmenter->previous_boundary(text.data.size() + 1);
|
||||
auto result = segmenter->previous_boundary(text.length_in_code_units() + 1);
|
||||
EXPECT(result.has_value());
|
||||
|
||||
result = segmenter->next_boundary(text.data.size() + 1);
|
||||
result = segmenter->next_boundary(text.length_in_code_units() + 1);
|
||||
EXPECT(!result.has_value());
|
||||
|
||||
result = segmenter->previous_boundary(text.data.size());
|
||||
result = segmenter->previous_boundary(text.length_in_code_units());
|
||||
EXPECT(result.has_value());
|
||||
|
||||
result = segmenter->next_boundary(text.data.size());
|
||||
result = segmenter->next_boundary(text.length_in_code_units());
|
||||
EXPECT(!result.has_value());
|
||||
|
||||
result = segmenter->next_boundary(0);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue