AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between
encodings and a teeny bit of memory, this commit adds a fast path for
substring and code point operations on UTF-16 data that contains no
surrogates.

These operations seem to account for a significant chunk of the time
spent in many regex benchmarks.
This commit is contained in:
Ali Mohammad Pur 2025-04-02 17:56:49 +02:00 committed by Andrew Kaster
commit eea81738cd
Notes: github-actions[bot] 2025-04-23 13:57:06 +00:00
11 changed files with 74 additions and 37 deletions

View file

@ -21,9 +21,13 @@ namespace AK {
// Storage type for UTF-16 code units: a Vector of u16 (second template
// argument 1 — presumably the inline capacity; confirm against Vector).
using Utf16Data = Vector<u16, 1>;
// NOTE(review): the three declarations below return a bare Utf16Data, while
// the same signatures are declared again further down returning
// Utf16ConversionResult. In this diff view they appear to be the pre-change
// lines that the commit replaces — confirm before relying on them.
ErrorOr<Utf16Data> utf8_to_utf16(StringView, Endianness = Endianness::Host);
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
// Result of converting text to UTF-16: the produced code units together with
// the number of code points they represent.
struct Utf16ConversionResult {
    // UTF-16 code units produced by the conversion.
    Utf16Data data;
    // Number of code points represented by `data`. Zero-initialized so that a
    // default-constructed result never carries an indeterminate count.
    size_t code_point_count { 0 };
};
// Conversions to UTF-16. Each returns either an error or a
// Utf16ConversionResult (the code units plus a code point count).
// The Endianness argument defaults to Endianness::Host.
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView, Endianness = Endianness::Host);
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
// Takes a mutable Utf16Data and a single u32 code point; presumably appends
// the encoded code unit(s) to the vector — confirm against the definition.
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
// Reports whether the given bytes pass UTF-16 (little-endian, per the name)
// validation; the result must not be discarded.
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
@ -77,6 +81,13 @@ public:
{
}
// Constructing a view from a temporary conversion result is disallowed.
// NOTE(review): presumably because the view only refers to
// conversion_result.data and would dangle once the temporary is destroyed —
// confirm that m_code_units is non-owning.
Utf16View(Utf16ConversionResult&&) = delete;
// Builds a view over the code units of an existing conversion result and
// adopts its code point count as the view's length in code points.
explicit Utf16View(Utf16ConversionResult const& conversion_result)
: m_code_units(conversion_result.data)
, m_length_in_code_points(conversion_result.code_point_count)
{
}
template<size_t Size>
Utf16View(char16_t const (&code_units)[Size])
: m_code_units(
@ -95,6 +106,8 @@ public:
// Conversions of this view to a ByteString / String; both may fail (ErrorOr).
// The AllowInvalidCodeUnits argument defaults to No — presumably it controls
// whether invalid code units (e.g. unpaired surrogates) are tolerated;
// confirm against the implementation.
ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
// Overwrites the stored code point length with the caller-supplied value.
// No validation happens here, so the caller is responsible for passing the
// correct count. Callable on a const view, so m_length_in_code_points is
// presumably declared mutable (declaration not visible here).
void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
// True when the underlying code-unit storage reports itself as null.
bool is_null() const { return m_code_units.is_null(); }
// True when the underlying code-unit storage reports itself as empty.
bool is_empty() const { return m_code_units.is_empty(); }
// Length of the view in code units (not code points), as reported by
// m_code_units.size().
size_t length_in_code_units() const { return m_code_units.size(); }