AK+LibJS: Extract some UTF-16 helpers for use in an outside class

An upcoming Utf16String will need access to these helpers. Let's make
them publicly available.
This commit is contained in:
Timothy Flynn 2025-06-12 17:39:05 -04:00 committed by Tim Flynn
commit 66006d3812
Notes: github-actions[bot] 2025-07-03 13:54:12 +00:00
10 changed files with 121 additions and 86 deletions

View file

@ -9,13 +9,14 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "Parser.h"
#include <AK/Array.h>
#include <AK/CharacterTypes.h>
#include <AK/HashTable.h>
#include <AK/ScopeGuard.h>
#include <AK/StdLibExtras.h>
#include <AK/TemporaryChange.h>
#include <AK/UnicodeUtils.h>
#include <LibJS/Parser.h>
#include <LibJS/Runtime/RegExpObject.h>
#include <LibRegex/Regex.h>
@ -4601,7 +4602,7 @@ FlyString Parser::consume_string_value()
Utf8View view { value.bytes_as_string_view().substring_view(value.bytes().size() - 3) };
VERIFY(view.length() <= 3);
auto codepoint = *view.begin();
if (Utf16View::is_high_surrogate(codepoint)) {
if (AK::UnicodeUtils::is_utf16_high_surrogate(codepoint)) {
syntax_error("StringValue ending with unpaired high surrogate"_string);
VERIFY(view.length() == 1);
}

View file

@ -8,6 +8,7 @@
#include <AK/CharacterTypes.h>
#include <AK/FlyString.h>
#include <AK/StringBuilder.h>
#include <AK/UnicodeUtils.h>
#include <AK/Utf16View.h>
#include <AK/Utf8View.h>
#include <LibJS/Runtime/AbstractOperations.h>
@ -308,7 +309,7 @@ void RopeString::resolve(EncodingPreference preference) const
auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
if (!AK::UnicodeUtils::is_utf16_high_surrogate(high_surrogate) || !AK::UnicodeUtils::is_utf16_low_surrogate(low_surrogate)) {
builder.append(current_string_as_utf8);
previous = current;
continue;
@ -316,7 +317,7 @@ void RopeString::resolve(EncodingPreference preference) const
// Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
builder.trim(3);
builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
builder.append_code_point(AK::UnicodeUtils::decode_utf16_surrogate_pair(high_surrogate, low_surrogate));
// Append the remaining part of the current string.
builder.append(current_string_as_utf8.substring_view(3));

View file

@ -5,8 +5,8 @@
*/
#include <AK/StringBuilder.h>
#include <AK/UnicodeUtils.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <LibJS/Runtime/AbstractOperations.h>
#include <LibJS/Runtime/Array.h>
#include <LibJS/Runtime/Error.h>
@ -129,7 +129,9 @@ JS_DEFINE_NATIVE_FUNCTION(StringConstructor::from_code_point)
return vm.throw_completion<RangeError>(ErrorType::InvalidCodePoint, next_code_point.to_string_without_side_effects());
// d. Set result to the string-concatenation of result and UTF16EncodeCodePoint(ℝ(nextCP)).
MUST(code_point_to_utf16(string, static_cast<u32>(code_point)));
(void)AK::UnicodeUtils::code_point_to_utf16(static_cast<u32>(code_point), [&](auto code_unit) {
string.append(code_unit);
});
}
// 3. Assert: If codePoints is empty, then result is the empty String.

View file

@ -8,6 +8,7 @@
#include <AK/Checked.h>
#include <AK/Function.h>
#include <AK/StringBuilder.h>
#include <AK/UnicodeUtils.h>
#include <AK/Utf16View.h>
#include <LibGC/Heap.h>
#include <LibJS/Runtime/AbstractOperations.h>
@ -121,7 +122,7 @@ CodePoint code_point_at(Utf16View const& string, size_t position)
}
// 6. If first is a trailing surrogate or position + 1 = size, then
if (Utf16View::is_low_surrogate(first) || (position + 1 == string.length_in_code_units())) {
if (AK::UnicodeUtils::is_utf16_low_surrogate(first) || (position + 1 == string.length_in_code_units())) {
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
return { true, code_point, 1 };
}
@ -130,13 +131,13 @@ CodePoint code_point_at(Utf16View const& string, size_t position)
auto second = string.code_unit_at(position + 1);
// 8. If second is not a trailing surrogate, then
if (!Utf16View::is_low_surrogate(second)) {
if (!AK::UnicodeUtils::is_utf16_low_surrogate(second)) {
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
return { true, code_point, 1 };
}
// 9. Set cp to UTF16SurrogatePairToCodePoint(first, second).
code_point = Utf16View::decode_surrogate_pair(first, second);
code_point = AK::UnicodeUtils::decode_utf16_surrogate_pair(first, second);
// 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }.
return { false, code_point, 2 };