AK+LibJS+LibWeb+LibRegex: Replace AK::Utf16Data with AK::Utf16String

This commit is contained in:
Timothy Flynn 2025-07-09 14:13:38 -04:00 committed by Tim Flynn
commit 9582895759
Notes: github-actions[bot] 2025-07-18 16:46:53 +00:00
22 changed files with 101 additions and 222 deletions

View file

@ -67,11 +67,6 @@ ErrorOr<String> String::from_utf8(StringView view)
return result;
}
// Builds a UTF-8 String from UTF-16 input by delegating to Utf16View::to_utf8() with its default
// arguments. Whatever that conversion yields, value or error, is handed back to the caller as-is.
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
{
    auto utf8_or_error = utf16.to_utf8();
    return utf8_or_error;
}
ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes bytes)
{
if (bytes.is_empty())
@ -80,7 +75,7 @@ ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes b
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
auto utf16_length = bytes.size() / 2;
Utf16Data well_formed_utf16;
Vector<char16_t> well_formed_utf16;
if (!validate_utf16_le(bytes)) {
well_formed_utf16.resize(bytes.size());
@ -109,7 +104,7 @@ ErrorOr<String> String::from_utf16_be_with_replacement_character(ReadonlyBytes b
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
auto utf16_length = bytes.size() / 2;
Utf16Data well_formed_utf16;
Vector<char16_t> well_formed_utf16;
if (!validate_utf16_le(bytes)) {
well_formed_utf16.resize(bytes.size());

View file

@ -69,7 +69,6 @@ public:
[[nodiscard]] static String from_string_builder_without_validation(Badge<StringBuilder>, StringBuilder&);
// Creates a new String from a sequence of UTF-16 encoded code points.
static ErrorOr<String> from_utf16(Utf16View const&);
static ErrorOr<String> from_utf16_le_with_replacement_character(ReadonlyBytes);
static ErrorOr<String> from_utf16_be_with_replacement_character(ReadonlyBytes);

View file

@ -10,77 +10,12 @@
#include <AK/StringView.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <simdutf.h>
namespace AK {
// Converts a UTF-8 or UTF-32 view to UTF-16 one code point at a time.
//
// Capacity is reserved up front from view.length(); each code point is then encoded through
// UnicodeUtils::try_code_point_to_utf16(), which feeds the resulting code unit(s) to the sink below.
// Returns the code units together with the number of code points visited, or the first allocation /
// encoding error encountered.
template<OneOf<Utf8View, Utf32View> UtfViewType>
static ErrorOr<Utf16ConversionResult> to_utf16_slow(UtfViewType const& view)
{
    Utf16Data code_units;
    TRY(code_units.try_ensure_capacity(view.length()));

    // Sink that receives every UTF-16 code unit produced for a code point.
    auto append_code_unit = [&code_units](auto code_unit) -> ErrorOr<void> {
        return code_units.try_append(code_unit);
    };

    size_t converted_code_points = 0;
    for (auto code_point : view) {
        TRY(UnicodeUtils::try_code_point_to_utf16(code_point, append_code_unit));
        ++converted_code_points;
    }

    return Utf16ConversionResult { move(code_units), converted_code_points };
}
// Convenience overload: wraps the string view in a Utf8View and forwards to the Utf8View overload,
// which performs the actual conversion.
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView utf8_view)
{
    Utf8View const as_utf8 { utf8_view };
    return utf8_to_utf16(as_utf8);
}
// Converts a UTF-8 view into UTF-16 code units and reports the number of code points converted.
//
// - An empty view yields an empty result with a code point count of 0.
// - Input that fails strict validation (AllowLonelySurrogates::No) is converted code point by code
//   point via to_utf16_slow(), because simdutf does not accept such input.
// - Everything else is sized and converted by simdutf in one pass.
//
// Returns an error if the output buffer cannot be allocated (or if the slow path fails).
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const& utf8_view)
{
    if (utf8_view.is_empty())
        return Utf16ConversionResult { Utf16Data {}, 0 };

    // All callers want to allow lonely surrogates, which simdutf does not permit.
    if (!utf8_view.validate(AllowLonelySurrogates::No)) [[unlikely]]
        return to_utf16_slow(utf8_view);

    auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
    auto length = utf8_view.byte_length();

    // Size the output exactly so that simdutf can write straight into it.
    Utf16Data utf16_data;
    TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length)));

    // FIXME: simdutf _could_ be telling us about this, but it doesn't -- so we have to compute it again.
    auto code_point_length = simdutf::count_utf8(data, length);

    [[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16(data, length, reinterpret_cast<char16_t*>(utf16_data.data()));
    ASSERT(result == utf16_data.size());

    // Move the buffer into the result (as to_utf16_slow() does) rather than copying every code unit.
    return Utf16ConversionResult { move(utf16_data), code_point_length };
}
// Converts a UTF-32 view into UTF-16 code units using simdutf.
//
// An empty view yields an empty result with a code point count of 0. Otherwise the output buffer is
// sized with simdutf::utf16_length_from_utf32() and filled by simdutf::convert_utf32_to_utf16(); the
// reported code point count is the view's length.
//
// Returns an error if the output buffer cannot be allocated.
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view)
{
    if (utf32_view.is_empty())
        return Utf16ConversionResult { Utf16Data {}, 0 };

    auto const* data = reinterpret_cast<char32_t const*>(utf32_view.code_points());
    auto length = utf32_view.length();

    // Size the output exactly so that simdutf can write straight into it.
    Utf16Data utf16_data;
    TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(data, length)));

    [[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16(data, length, reinterpret_cast<char16_t*>(utf16_data.data()));
    ASSERT(result == utf16_data.size());

    // Move the buffer into the result rather than copying every code unit on return.
    return Utf16ConversionResult { move(utf16_data), length };
}
bool validate_utf16_le(ReadonlyBytes bytes)
{
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);

View file

@ -23,16 +23,6 @@
namespace AK {
using Utf16Data = Vector<char16_t, 1>;
// Result of converting text to UTF-16: the converted code units plus a code point count.
// Produced by utf8_to_utf16() and utf32_to_utf16().
struct Utf16ConversionResult {
// The converted UTF-16 code units.
Utf16Data data;
// Number of code points that were converted to produce `data`.
size_t code_point_count;
};
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView);
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const&);
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&);
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
@ -156,13 +146,6 @@ public:
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
}
// Creates a view over the code units held by a Utf16Data buffer. The view only stores the buffer's
// data pointer and size; the stored length additionally carries the UTF-16 storage flag bit.
constexpr Utf16View(Utf16Data const& string)
    : m_string { .utf16 = string.data() }
    , m_length_in_code_units(string.size() | (1uz << Detail::UTF16_FLAG))
{
}
consteval Utf16View(StringView string)
: m_string { .ascii = string.characters_without_null_termination() }
, m_length_in_code_units(string.length())
@ -170,15 +153,6 @@ public:
VERIFY(all_of(string, AK::is_ascii));
}
// Constructing a view from an rvalue conversion result is rejected at compile time.
Utf16View(Utf16ConversionResult&&) = delete;

// Creates a view over the code units of a conversion result and adopts its code point count, so the
// count is already available on the view. The stored length carries the UTF-16 storage flag bit.
explicit Utf16View(Utf16ConversionResult const& conversion_result)
    : m_string { .utf16 = conversion_result.data.data() }
    , m_length_in_code_units(conversion_result.data.size() | (1uz << Detail::UTF16_FLAG))
    , m_length_in_code_points(conversion_result.code_point_count)
{
}
ErrorOr<String> to_utf8(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
ErrorOr<ByteString> to_byte_string(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
@ -314,18 +288,6 @@ public:
return m_length_in_code_points;
}
// Returns the length in code points when it is available without scanning the string:
// - ASCII storage: the code unit length is returned directly.
// - Otherwise: the cached code point length, or an empty Optional while the cache still holds the
//   NumericLimits<size_t>::max() placeholder.
constexpr Optional<size_t> length_in_code_points_if_known() const
{
    if (!has_ascii_storage()) {
        if (m_length_in_code_points != NumericLimits<size_t>::max())
            return m_length_in_code_points;
        return {};
    }

    return m_length_in_code_units;
}
// Overwrites the cached code point length with a caller-supplied value. Nothing here checks the
// value against the viewed data (hence "unsafe"), so the caller is responsible for its accuracy.
constexpr void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
[[nodiscard]] constexpr char16_t code_unit_at(size_t index) const
{
VERIFY(index < length_in_code_units());
@ -591,6 +553,5 @@ inline constexpr bool IsHashCompatible<Utf16String, Utf16View> = true;
}
#if USING_AK_GLOBALLY
using AK::Utf16Data;
using AK::Utf16View;
#endif

View file

@ -111,7 +111,7 @@ ErrorOr<String> Process::get_name()
if (!length)
return Error::from_windows_error();
return String::from_utf16(Utf16View { reinterpret_cast<char16_t const*>(path), length });
return MUST(Utf16View { reinterpret_cast<char16_t const*>(path), length }.to_utf8());
}
ErrorOr<void> Process::set_name(StringView, SetThreadName)

View file

@ -559,7 +559,7 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::encode_uri_component)
JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape)
{
// 1. Set string to ? ToString(string).
auto string = TRY(vm.argument(0).to_byte_string(vm));
auto string = TRY(vm.argument(0).to_utf16_string(vm));
// 3. Let R be the empty String.
StringBuilder escaped;
@ -570,29 +570,29 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape)
// 2. Let length be the length of string.
// 5. Let k be 0.
// 6. Repeat, while k < length,
auto utf16_conversion = TRY_OR_THROW_OOM(vm, utf8_to_utf16(string));
for (auto code_point : utf16_conversion.data) {
for (size_t k = 0; k < string.length_in_code_units(); ++k) {
// a. Let char be the code unit at index k within string.
auto code_unit = string.code_unit_at(k);
// b. If unescapedSet contains char, then
// NOTE: We know unescapedSet is ASCII-only, so ensure we have an ASCII codepoint before casting to char.
if (is_ascii(code_point) && unescaped_set.contains(static_cast<char>(code_point))) {
if (is_ascii(code_unit) && unescaped_set.contains(static_cast<char>(code_unit))) {
// i. Let S be the String value containing the single code unit char.
escaped.append(code_point);
escaped.append(static_cast<char>(code_unit));
}
// c. Else,
// i. Let n be the numeric value of char.
// ii. If n < 256, then
else if (code_point < 256) {
else if (code_unit < 256) {
// 1. Let hex be the String representation of n, formatted as an uppercase hexadecimal number.
// 2. Let S be the string-concatenation of "%" and ! StringPad(hex, 2𝔽, "0", start).
escaped.appendff("%{:02X}", code_point);
escaped.appendff("%{:02X}", code_unit);
}
// iii. Else,
else {
// 1. Let hex be the String representation of n, formatted as an uppercase hexadecimal number.
// 2. Let S be the string-concatenation of "%u" and ! StringPad(hex, 4𝔽, "0", start).
escaped.appendff("%u{:04X}", code_point);
escaped.appendff("%u{:04X}", code_unit);
}
// d. Set R to the string-concatenation of R and S.

View file

@ -93,26 +93,21 @@ ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern,
if (unicode && unicode_sets)
return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v')) };
auto utf16_pattern_result = AK::utf8_to_utf16(pattern);
if (utf16_pattern_result.is_error())
return ParseRegexPatternError { "Out of memory"_string };
auto utf16_result = utf16_pattern_result.release_value();
Utf16View utf16_pattern_view { utf16_result };
auto utf16_pattern = Utf16String::from_utf8(pattern);
StringBuilder builder;
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
auto previous_code_unit_was_backslash = false;
for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
for (size_t i = 0; i < utf16_pattern.length_in_code_units();) {
if (unicode || unicode_sets) {
auto code_point = code_point_at(utf16_pattern_view, i);
auto code_point = code_point_at(utf16_pattern, i);
builder.append_code_point(code_point.code_point);
i += code_point.code_unit_count;
continue;
}
u16 code_unit = utf16_pattern_view.code_unit_at(i);
u16 code_unit = utf16_pattern.code_unit_at(i);
++i;
if (code_unit > 0x7f) {

View file

@ -512,7 +512,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
return ExecutionResult::Failed_ExecuteLowPrioForks;
Optional<ByteString> str;
Utf16Data utf16;
Utf16String utf16;
Vector<u32> data;
data.ensure_capacity(length);
for (size_t i = offset; i < offset + length; ++i)

View file

@ -8,14 +8,15 @@
#include "Forward.h"
#include "RegexOptions.h"
#include <AK/Error.h>
#include <AK/ByteString.h>
#include <AK/COWVector.h>
#include <AK/Error.h>
#include <AK/FlyString.h>
#include <AK/MemMem.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
@ -110,7 +111,7 @@ public:
return view;
}
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16Data& optional_utf16_storage) const
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16String& optional_utf16_storage) const
{
auto view = m_view.visit(
[&optional_string_storage, data]<typename T>(T const&) {
@ -121,11 +122,8 @@ public:
return RegexStringView { T { *optional_string_storage } };
},
[&optional_utf16_storage, data](Utf16View) {
auto conversion_result = utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
optional_utf16_storage = conversion_result.data;
auto view = Utf16View { optional_utf16_storage };
view.unsafe_set_code_point_length(conversion_result.code_point_count);
return RegexStringView { view };
optional_utf16_storage = Utf16String::from_utf32({ data.data(), data.size() });
return RegexStringView { optional_utf16_storage.utf16_view() };
});
view.set_unicode(unicode());

View file

@ -46,9 +46,8 @@ WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t
{
// 1. Let length be node's length.
// FIXME: This is very inefficient!
auto utf16_result = MUST(AK::utf8_to_utf16(m_data));
Utf16View utf16_view { utf16_result };
auto length = utf16_view.length_in_code_units();
auto utf16_string = Utf16String::from_utf8(m_data);
auto length = utf16_string.length_in_code_units();
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
if (offset > length)
@ -57,10 +56,10 @@ WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t
// 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit
// to the end of node's data, and then return.
if (offset + count > length)
return MUST(utf16_view.substring_view(offset).to_utf8());
return MUST(utf16_string.substring_view(offset).to_utf8());
// 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in node's data.
return MUST(utf16_view.substring_view(offset, count).to_utf8());
return MUST(utf16_string.substring_view(offset, count).to_utf8());
}
// https://dom.spec.whatwg.org/#concept-cd-replace
@ -68,9 +67,8 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
{
// 1. Let length be node's length.
// FIXME: This is very inefficient!
auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
Utf16View utf16_view { utf16_data };
auto length = utf16_view.length_in_code_units();
auto utf16_string = Utf16String::from_utf8(m_data);
auto length = utf16_string.length_in_code_units();
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
if (offset > length)
@ -83,17 +81,17 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
// 5. Insert data into node's data after offset code units.
// 6. Let delete offset be offset + data's length.
// 7. Starting from delete offset code units, remove count code units from node's data.
auto before_data = utf16_view.substring_view(0, offset);
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
auto after_data = utf16_view.substring_view(offset + count);
auto before_data = utf16_string.substring_view(0, offset);
auto inserted_data = Utf16String::from_utf8(data);
auto after_data = utf16_string.substring_view(offset + count);
StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data.length_in_code_units() + after_data.length_in_code_units());
full_data.append(before_data);
full_data.append(inserted_data_result.data);
full_data.append(inserted_data);
full_data.append(after_data);
auto full_view = full_data.utf16_string_view();
bool characters_are_the_same = utf16_view == full_view;
auto full_view = full_data.utf16_string_view();
bool characters_are_the_same = utf16_string == full_view;
auto old_data = m_data;
// OPTIMIZATION: Skip UTF-8 encoding if the characters are the same.
@ -123,14 +121,14 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
// start offset by data's length and decrease it by count.
for (auto* range : Range::live_ranges()) {
if (range->start_container() == this && range->start_offset() > (offset + count))
range->set_start_offset(range->start_offset() + inserted_data_result.data.size() - count);
range->set_start_offset(range->start_offset() + inserted_data.length_in_code_units() - count);
}
// 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end
// offset by data's length and decrease it by count.
for (auto* range : Range::live_ranges()) {
if (range->end_container() == this && range->end_offset() > (offset + count))
range->set_end_offset(range->end_offset() + inserted_data_result.data.size() - count);
range->set_end_offset(range->end_offset() + inserted_data.length_in_code_units() - count);
}
// 12. If node's parent is non-null, then run the children changed steps for node's parent.

View file

@ -6158,8 +6158,7 @@ Vector<GC::Root<Range>> Document::find_matching_text(String const& query, CaseSe
if (text_blocks.is_empty())
return {};
auto utf16_query = MUST(AK::utf8_to_utf16(query));
Utf16View query_view { utf16_query };
auto utf16_query = Utf16String::from_utf8(query);
Vector<GC::Root<Range>> matches;
for (auto const& text_block : text_blocks) {
@ -6169,8 +6168,8 @@ Vector<GC::Root<Range>> Document::find_matching_text(String const& query, CaseSe
auto* match_start_position = text_block.positions.data();
while (true) {
auto match_index = case_sensitivity == CaseSensitivity::CaseInsensitive
? text_view.find_code_unit_offset_ignoring_case(query_view, offset)
: text_view.find_code_unit_offset(query_view, offset);
? text_view.find_code_unit_offset_ignoring_case(utf16_query, offset)
: text_view.find_code_unit_offset(utf16_query, offset);
if (!match_index.has_value())
break;
@ -6181,15 +6180,15 @@ Vector<GC::Root<Range>> Document::find_matching_text(String const& query, CaseSe
auto& start_dom_node = match_start_position->dom_node;
auto* match_end_position = match_start_position;
for (; i < text_block.positions.size() - 1 && (match_index.value() + query_view.length_in_code_units() > text_block.positions[i + 1].start_offset); ++i)
for (; i < text_block.positions.size() - 1 && (match_index.value() + utf16_query.length_in_code_units() > text_block.positions[i + 1].start_offset); ++i)
match_end_position = &text_block.positions[i + 1];
auto& end_dom_node = match_end_position->dom_node;
auto end_position = match_index.value() + query_view.length_in_code_units() - match_end_position->start_offset;
auto end_position = match_index.value() + utf16_query.length_in_code_units() - match_end_position->start_offset;
matches.append(Range::create(start_dom_node, start_position, end_dom_node, end_position));
match_start_position = match_end_position;
offset = match_index.value() + query_view.length_in_code_units() + 1;
offset = match_index.value() + utf16_query.length_in_code_units() + 1;
if (offset >= text_view.length_in_code_units())
break;
}

View file

@ -384,9 +384,9 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa
auto parent_white_space_collapse = resolved_keyword(*start_node->parent(), CSS::PropertyID::WhiteSpaceCollapse);
// FIXME: Find a way to get code points directly from the UTF-8 string
auto start_node_data = *start_node->text_content();
auto utf16_code_units = MUST(AK::utf8_to_utf16(start_node_data));
auto offset_minus_one_code_point = Utf16View { utf16_code_units }.code_point_at(start_offset - 1);
auto start_node_data = Utf16String::from_utf8(*start_node->text_content());
auto offset_minus_one_code_point = start_node_data.code_point_at(start_offset - 1);
if (parent_white_space_collapse != CSS::Keyword::Preserve && (offset_minus_one_code_point == 0x20 || offset_minus_one_code_point == 0xA0)) {
--start_offset;
continue;
@ -437,9 +437,9 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa
auto parent_white_space_collapse = resolved_keyword(*end_node->parent(), CSS::PropertyID::WhiteSpaceCollapse);
// FIXME: Find a way to get code points directly from the UTF-8 string
auto end_node_data = *end_node->text_content();
auto utf16_code_units = MUST(AK::utf8_to_utf16(end_node_data));
auto offset_code_point = Utf16View { utf16_code_units }.code_point_at(end_offset);
auto end_node_data = Utf16String::from_utf8(*end_node->text_content());
auto offset_code_point = end_node_data.code_point_at(end_offset);
if (parent_white_space_collapse != CSS::Keyword::Preserve && (offset_code_point == 0x20 || offset_code_point == 0xA0)) {
// 1. If fix collapsed space is true, and collapse spaces is true, and the end offsetth
// code unit of end node's data is a space (0x0020): call deleteData(end offset, 1)
@ -556,16 +556,14 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa
// 1. Remove the first code unit from replacement whitespace, and let element be that
// code unit.
// FIXME: Find a way to get code points directly from the UTF-8 string
auto replacement_whitespace_utf16 = MUST(AK::utf8_to_utf16(replacement_whitespace));
auto replacement_whitespace_utf16_view = Utf16View { replacement_whitespace_utf16 };
replacement_whitespace = MUST(String::from_utf16({ replacement_whitespace_utf16_view.substring_view(1) }));
auto element = replacement_whitespace_utf16_view.code_point_at(0);
auto replacement_whitespace_utf16 = Utf16String::from_utf8(replacement_whitespace);
replacement_whitespace = MUST(replacement_whitespace_utf16.substring_view(1).to_utf8());
auto element = replacement_whitespace_utf16.code_point_at(0);
// 2. If element is not the same as the start offsetth code unit of start node's data:
auto start_node_data = *start_node->text_content();
auto start_node_utf16 = MUST(AK::utf8_to_utf16(start_node_data));
auto start_node_utf16_view = Utf16View { start_node_utf16 };
auto start_node_code_point = start_node_utf16_view.code_point_at(start_offset);
auto start_node_data = Utf16String::from_utf8(*start_node->text_content());
auto start_node_code_point = start_node_data.code_point_at(start_offset);
if (element != start_node_code_point) {
// 1. Call insertData(start offset, element) on start node.
auto& start_node_character_data = static_cast<DOM::CharacterData&>(*start_node);

View file

@ -106,11 +106,10 @@ WebIDL::ExceptionOr<FileReader::Result> FileReader::blob_package_data(JS::Realm&
return JS::ArrayBuffer::create(realm, move(bytes));
case Type::BinaryString:
// Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255].
Utf16Data builder;
builder.ensure_capacity(bytes.size());
StringBuilder builder(StringBuilder::Mode::UTF16, bytes.size());
for (auto byte : bytes.bytes())
builder.unchecked_append(byte);
return MUST(Utf16View { builder }.to_utf8());
builder.append_code_unit(byte);
return MUST(builder.utf16_string_view().to_utf8());
}
VERIFY_NOT_REACHED();
}

View file

@ -12,6 +12,7 @@
#include <AK/FlyString.h>
#include <AK/GenericLexer.h>
#include <AK/String.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h>
#include <AK/Utf8View.h>
#include <LibWeb/Infra/CharacterTypes.h>
@ -63,10 +64,8 @@ ErrorOr<String> strip_and_collapse_whitespace(StringView string)
// https://infra.spec.whatwg.org/#code-unit-prefix
bool is_code_unit_prefix(StringView potential_prefix_utf8, StringView input_utf8)
{
auto potential_prefix_utf16_bytes = MUST(utf8_to_utf16(potential_prefix_utf8));
auto input_utf16_bytes = MUST(utf8_to_utf16(input_utf8));
Utf16View potential_prefix { potential_prefix_utf16_bytes };
Utf16View input { input_utf16_bytes };
auto potential_prefix = Utf16String::from_utf8(potential_prefix_utf8);
auto input = Utf16String::from_utf8(input_utf8);
// 1. Let i be 0.
size_t i = 0;
@ -148,9 +147,10 @@ bool code_unit_less_than(StringView a, StringView b)
if (a.is_ascii() && b.is_ascii())
return a < b;
auto a_utf16 = MUST(utf8_to_utf16(a));
auto b_utf16 = MUST(utf8_to_utf16(b));
return Utf16View { a_utf16 }.is_code_unit_less_than(Utf16View { b_utf16 });
auto a_utf16 = Utf16String::from_utf8(a);
auto b_utf16 = Utf16String::from_utf8(b);
return a_utf16.utf16_view().is_code_unit_less_than(b_utf16);
}
}

View file

@ -50,17 +50,18 @@ Vector<Viewport::TextBlock> const& Viewport::text_blocks()
void Viewport::update_text_blocks()
{
StringBuilder builder;
StringBuilder builder(StringBuilder::Mode::UTF16);
size_t current_start_position = 0;
Vector<TextPosition> text_positions;
Vector<TextBlock> text_blocks;
for_each_in_inclusive_subtree([&](auto const& layout_node) {
if (layout_node.display().is_none() || !layout_node.first_paintable() || !layout_node.first_paintable()->is_visible())
return TraversalDecision::Continue;
if (layout_node.is_box() || layout_node.is_generated()) {
if (!builder.is_empty()) {
text_blocks.append({ MUST(AK::utf8_to_utf16(builder.string_view())), text_positions });
text_blocks.append({ builder.to_utf16_string(), text_positions });
current_start_position = 0;
text_positions.clear_with_capacity();
builder.clear();
@ -79,10 +80,9 @@ void Viewport::update_text_blocks()
text_positions.empend(dom_node, current_start_position);
}
auto const& current_node_text = text_node->text_for_rendering();
auto const current_node_text_utf16 = MUST(AK::utf8_to_utf16(current_node_text));
current_start_position += current_node_text_utf16.data.size();
builder.append(move(current_node_text));
auto const& current_node_text = Utf16String::from_utf8(text_node->text_for_rendering());
current_start_position += current_node_text.length_in_code_units();
builder.append(current_node_text);
}
}
@ -90,7 +90,7 @@ void Viewport::update_text_blocks()
});
if (!builder.is_empty())
text_blocks.append({ MUST(AK::utf8_to_utf16(builder.string_view())), text_positions });
text_blocks.append({ builder.to_utf16_string(), text_positions });
m_text_blocks = move(text_blocks);
}

View file

@ -6,6 +6,7 @@
#pragma once
#include <AK/Utf16String.h>
#include <LibWeb/DOM/Document.h>
#include <LibWeb/Layout/BlockContainer.h>
@ -24,7 +25,7 @@ public:
size_t start_offset { 0 };
};
struct TextBlock {
AK::Utf16ConversionResult text;
Utf16String text;
Vector<TextPosition> positions;
};
Vector<TextBlock> const& text_blocks();

View file

@ -252,9 +252,8 @@ Utf16View PaintableFragment::utf16_view() const
return {};
if (!m_text_in_utf16.has_value())
m_text_in_utf16 = MUST(AK::utf8_to_utf16(utf8_view()));
return Utf16View { m_text_in_utf16.value() };
m_text_in_utf16 = Utf16String::from_utf8(utf8_view().as_string());
return *m_text_in_utf16;
}
}

View file

@ -6,6 +6,7 @@
#pragma once
#include <AK/Utf16String.h>
#include <LibGfx/TextLayout.h>
#include <LibWeb/Layout/Node.h>
#include <LibWeb/Painting/ShadowData.h>
@ -64,7 +65,7 @@ private:
CSS::WritingMode m_writing_mode;
Vector<ShadowData> m_shadows;
CSSPixels m_text_decoration_thickness { 0 };
mutable Optional<AK::Utf16ConversionResult> m_text_in_utf16;
mutable Optional<Utf16String> m_text_in_utf16;
};
}

View file

@ -48,8 +48,8 @@ ByteString SVGTextContentElement::text_contents() const
// https://svgwg.org/svg2-draft/text.html#__svg__SVGTextContentElement__getNumberOfChars
WebIDL::ExceptionOr<WebIDL::Long> SVGTextContentElement::get_number_of_chars() const
{
auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents())).data;
return static_cast<WebIDL::Long>(chars.size());
auto length_in_code_units = AK::utf16_code_unit_length_from_utf8(text_contents());
return static_cast<WebIDL::Long>(length_in_code_units);
}
GC::Ref<Geometry::DOMPoint> SVGTextContentElement::get_start_position_of_char(WebIDL::UnsignedLong charnum)

View file

@ -15,7 +15,7 @@
TEST_CASE(decode_ascii)
{
auto string = MUST(AK::utf8_to_utf16("Hello World!11"sv));
auto string = Utf16String::from_utf8("Hello World!11"sv);
Utf16View view { string };
size_t valid_code_units = 0;
@ -34,7 +34,7 @@ TEST_CASE(decode_ascii)
TEST_CASE(decode_utf8)
{
auto string = MUST(AK::utf8_to_utf16("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv));
auto string = Utf16String::from_utf8("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv);
Utf16View view { string };
size_t valid_code_units = 0;
@ -55,7 +55,7 @@ TEST_CASE(encode_utf8)
{
{
auto utf8_string = "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"_string;
auto string = MUST(AK::utf8_to_utf16(utf8_string));
auto string = Utf16String::from_utf8(utf8_string);
Utf16View view { string };
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), utf8_string);
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::No)), utf8_string);
@ -139,7 +139,7 @@ TEST_CASE(utf16_literal)
TEST_CASE(iterate_utf16)
{
auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv));
auto string = Utf16String::from_utf8("Привет 😀"sv);
Utf16View view { string };
auto iterator = view.begin();
@ -371,16 +371,16 @@ TEST_CASE(to_ascii_titlecase)
TEST_CASE(equals_ignoring_case)
{
auto string1 = MUST(AK::utf8_to_utf16("foobar"sv));
auto string2 = MUST(AK::utf8_to_utf16("FooBar"sv));
auto string1 = Utf16String::from_utf8("foobar"sv);
auto string2 = Utf16String::from_utf8("FooBar"sv);
EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
string1 = MUST(AK::utf8_to_utf16(""sv));
string2 = MUST(AK::utf8_to_utf16(""sv));
string1 = Utf16String::from_utf8(""sv);
string2 = Utf16String::from_utf8(""sv);
EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
string1 = MUST(AK::utf8_to_utf16(""sv));
string2 = MUST(AK::utf8_to_utf16("FooBar"sv));
string1 = Utf16String::from_utf8(""sv);
string2 = Utf16String::from_utf8("FooBar"sv);
EXPECT(!Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
}
@ -425,7 +425,7 @@ TEST_CASE(replace)
TEST_CASE(substring_view)
{
auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv));
auto string = Utf16String::from_utf8("Привет 😀"sv);
{
Utf16View view { string };
view = view.substring_view(7, 2);
@ -532,7 +532,7 @@ TEST_CASE(starts_with)
TEST_CASE(find_code_unit_offset)
{
auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv));
auto conversion_result = Utf16String::from_utf8("😀foo😀bar"sv);
Utf16View const view { conversion_result };
EXPECT_EQ(0u, view.find_code_unit_offset(u""sv).value());
@ -549,7 +549,7 @@ TEST_CASE(find_code_unit_offset)
TEST_CASE(find_code_unit_offset_ignoring_case)
{
auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv));
auto conversion_result = Utf16String::from_utf8("😀Foo😀Bar"sv);
Utf16View const view { conversion_result };
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u""sv).value());

View file

@ -823,7 +823,7 @@ TEST_CASE(ECMA262_unicode_match)
for (auto& test : tests) {
Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
auto subject = MUST(AK::utf8_to_utf16(test.subject));
auto subject = Utf16String::from_utf8(test.subject);
Utf16View view { subject };
if constexpr (REGEX_DEBUG) {
@ -956,7 +956,7 @@ TEST_CASE(ECMA262_property_match)
for (auto& test : tests) {
Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options);
auto subject = MUST(AK::utf8_to_utf16(test.subject));
auto subject = Utf16String::from_utf8(test.subject);
Utf16View view { subject };
if constexpr (REGEX_DEBUG) {

View file

@ -9,6 +9,7 @@
#include <AK/Array.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h>
#include <AK/Vector.h>
#include <LibUnicode/Segmenter.h>
@ -155,21 +156,21 @@ TEST_CASE(out_of_bounds)
EXPECT(!result.has_value());
}
{
auto text = MUST(AK::utf8_to_utf16("foo"sv));
auto text = u"foo"_utf16;
auto segmenter = Unicode::Segmenter::create(Unicode::SegmenterGranularity::Word);
segmenter->set_segmented_text(Utf16View { text });
segmenter->set_segmented_text(text);
auto result = segmenter->previous_boundary(text.data.size() + 1);
auto result = segmenter->previous_boundary(text.length_in_code_units() + 1);
EXPECT(result.has_value());
result = segmenter->next_boundary(text.data.size() + 1);
result = segmenter->next_boundary(text.length_in_code_units() + 1);
EXPECT(!result.has_value());
result = segmenter->previous_boundary(text.data.size());
result = segmenter->previous_boundary(text.length_in_code_units());
EXPECT(result.has_value());
result = segmenter->next_boundary(text.data.size());
result = segmenter->next_boundary(text.length_in_code_units());
EXPECT(!result.has_value());
result = segmenter->next_boundary(0);