LibJS: Port RegExp flags and patterns to UTF-16

This commit is contained in:
Timothy Flynn 2025-08-06 11:28:18 -04:00 committed by Tim Flynn
commit 62d85dd90a
Notes: github-actions[bot] 2025-08-13 13:57:14 +00:00
8 changed files with 55 additions and 46 deletions

View file

@ -1340,7 +1340,7 @@ private:
class RegExpLiteral final : public Expression { class RegExpLiteral final : public Expression {
public: public:
RegExpLiteral(SourceRange source_range, regex::Parser::Result parsed_regex, String parsed_pattern, regex::RegexOptions<ECMAScriptFlags> parsed_flags, String pattern, String flags) RegExpLiteral(SourceRange source_range, regex::Parser::Result parsed_regex, String parsed_pattern, regex::RegexOptions<ECMAScriptFlags> parsed_flags, Utf16String pattern, Utf16String flags)
: Expression(move(source_range)) : Expression(move(source_range))
, m_parsed_regex(move(parsed_regex)) , m_parsed_regex(move(parsed_regex))
, m_parsed_pattern(move(parsed_pattern)) , m_parsed_pattern(move(parsed_pattern))
@ -1356,15 +1356,15 @@ public:
regex::Parser::Result const& parsed_regex() const { return m_parsed_regex; } regex::Parser::Result const& parsed_regex() const { return m_parsed_regex; }
String const& parsed_pattern() const { return m_parsed_pattern; } String const& parsed_pattern() const { return m_parsed_pattern; }
regex::RegexOptions<ECMAScriptFlags> const& parsed_flags() const { return m_parsed_flags; } regex::RegexOptions<ECMAScriptFlags> const& parsed_flags() const { return m_parsed_flags; }
String const& pattern() const { return m_pattern; } Utf16String const& pattern() const { return m_pattern; }
String const& flags() const { return m_flags; } Utf16String const& flags() const { return m_flags; }
private: private:
regex::Parser::Result m_parsed_regex; regex::Parser::Result m_parsed_regex;
String m_parsed_pattern; String m_parsed_pattern;
regex::RegexOptions<ECMAScriptFlags> m_parsed_flags; regex::RegexOptions<ECMAScriptFlags> m_parsed_flags;
String m_pattern; Utf16String m_pattern;
String m_flags; Utf16String m_flags;
}; };
class PrivateIdentifier final : public Expression { class PrivateIdentifier final : public Expression {

View file

@ -444,8 +444,8 @@ Bytecode::CodeGenerationErrorOr<Optional<ScopedOperand>> StringLiteral::generate
Bytecode::CodeGenerationErrorOr<Optional<ScopedOperand>> RegExpLiteral::generate_bytecode(Bytecode::Generator& generator, Optional<ScopedOperand> preferred_dst) const Bytecode::CodeGenerationErrorOr<Optional<ScopedOperand>> RegExpLiteral::generate_bytecode(Bytecode::Generator& generator, Optional<ScopedOperand> preferred_dst) const
{ {
Bytecode::Generator::SourceLocationScope scope(generator, *this); Bytecode::Generator::SourceLocationScope scope(generator, *this);
auto source_index = generator.intern_string(m_pattern); auto source_index = generator.intern_string(m_pattern.to_utf8_but_should_be_ported_to_utf16());
auto flags_index = generator.intern_string(m_flags); auto flags_index = generator.intern_string(m_flags.to_utf8_but_should_be_ported_to_utf16());
auto regex_index = generator.intern_regex(Bytecode::ParsedRegex { auto regex_index = generator.intern_regex(Bytecode::ParsedRegex {
.regex = m_parsed_regex, .regex = m_parsed_regex,
.pattern = m_parsed_pattern, .pattern = m_parsed_pattern,

View file

@ -1542,7 +1542,7 @@ inline ThrowCompletionOr<CalleeAndThis> get_callee_and_this_from_environment(Byt
} }
// 13.2.7.3 Runtime Semantics: Evaluation, https://tc39.es/ecma262/#sec-regular-expression-literals-runtime-semantics-evaluation // 13.2.7.3 Runtime Semantics: Evaluation, https://tc39.es/ecma262/#sec-regular-expression-literals-runtime-semantics-evaluation
inline Value new_regexp(VM& vm, ParsedRegex const& parsed_regex, String const& pattern, String const& flags) inline Value new_regexp(VM& vm, ParsedRegex const& parsed_regex, Utf16String pattern, Utf16String flags)
{ {
// 1. Let pattern be CodePointsToString(BodyText of RegularExpressionLiteral). // 1. Let pattern be CodePointsToString(BodyText of RegularExpressionLiteral).
// 2. Let flags be CodePointsToString(FlagText of RegularExpressionLiteral). // 2. Let flags be CodePointsToString(FlagText of RegularExpressionLiteral).
@ -1551,7 +1551,7 @@ inline Value new_regexp(VM& vm, ParsedRegex const& parsed_regex, String const& p
auto& realm = *vm.current_realm(); auto& realm = *vm.current_realm();
Regex<ECMA262> regex(parsed_regex.regex, parsed_regex.pattern.to_byte_string(), parsed_regex.flags); Regex<ECMA262> regex(parsed_regex.regex, parsed_regex.pattern.to_byte_string(), parsed_regex.flags);
// NOTE: We bypass RegExpCreate and subsequently RegExpAlloc as an optimization to use the already parsed values. // NOTE: We bypass RegExpCreate and subsequently RegExpAlloc as an optimization to use the already parsed values.
auto regexp_object = RegExpObject::create(realm, move(regex), pattern, flags); auto regexp_object = RegExpObject::create(realm, move(regex), move(pattern), move(flags));
// RegExpAlloc has these two steps from the 'Legacy RegExp features' proposal. // RegExpAlloc has these two steps from the 'Legacy RegExp features' proposal.
regexp_object->set_realm(realm); regexp_object->set_realm(realm);
// We don't need to check 'If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true' // We don't need to check 'If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true'
@ -2264,8 +2264,8 @@ void NewRegExp::execute_impl(Bytecode::Interpreter& interpreter) const
new_regexp( new_regexp(
interpreter.vm(), interpreter.vm(),
interpreter.current_executable().regex_table->get(m_regex_index), interpreter.current_executable().regex_table->get(m_regex_index),
interpreter.current_executable().get_string(m_source_index), Utf16String::from_utf8(interpreter.current_executable().get_string(m_source_index)),
interpreter.current_executable().get_string(m_flags_index))); Utf16String::from_utf8(interpreter.current_executable().get_string(m_flags_index))));
} }
#define JS_DEFINE_NEW_BUILTIN_ERROR_OP(ErrorName) \ #define JS_DEFINE_NEW_BUILTIN_ERROR_OP(ErrorName) \

View file

@ -1858,16 +1858,17 @@ Parser::PrimaryExpressionParseResult Parser::parse_primary_expression()
NonnullRefPtr<RegExpLiteral const> Parser::parse_regexp_literal() NonnullRefPtr<RegExpLiteral const> Parser::parse_regexp_literal()
{ {
auto rule_start = push_start(); auto rule_start = push_start();
auto pattern = consume().value().to_utf8_but_should_be_ported_to_utf16(); auto pattern_view = consume().value();
// Remove leading and trailing slash.
pattern = MUST(pattern.substring_from_byte_offset(1, pattern.bytes().size() - 2));
auto flags = String {}; // Remove leading and trailing slash.
auto pattern = Utf16String::from_utf16(pattern_view.substring_view(1, pattern_view.length_in_code_units() - 2));
Utf16String flags {};
auto parsed_flags = RegExpObject::default_flags; auto parsed_flags = RegExpObject::default_flags;
if (match(TokenType::RegexFlags)) { if (match(TokenType::RegexFlags)) {
auto flags_start = position(); auto flags_start = position();
flags = consume().value().to_utf8_but_should_be_ported_to_utf16(); flags = consume().fly_string_value().to_utf16_string();
auto parsed_flags_or_error = regex_flags_from_string(flags); auto parsed_flags_or_error = regex_flags_from_string(flags);
if (parsed_flags_or_error.is_error()) if (parsed_flags_or_error.is_error())
@ -1890,7 +1891,7 @@ NonnullRefPtr<RegExpLiteral const> Parser::parse_regexp_literal()
syntax_error(MUST(String::formatted("RegExp compile error: {}", Regex<ECMA262>(parsed_regex, parsed_pattern.to_byte_string(), parsed_flags).error_string())), rule_start.position()); syntax_error(MUST(String::formatted("RegExp compile error: {}", Regex<ECMA262>(parsed_regex, parsed_pattern.to_byte_string(), parsed_flags).error_string())), rule_start.position());
SourceRange range { m_source_code, rule_start.position(), position() }; SourceRange range { m_source_code, rule_start.position(), position() };
return create_ast_node<RegExpLiteral>(move(range), move(parsed_regex), move(parsed_pattern), move(parsed_flags), move(pattern), move(flags)); return create_ast_node<RegExpLiteral>(move(range), move(parsed_regex), move(parsed_pattern), parsed_flags, move(pattern), move(flags));
} }
static bool is_simple_assignment_target(Expression const& expression, bool allow_web_reality_call_expression = true) static bool is_simple_assignment_target(Expression const& expression, bool allow_web_reality_call_expression = true)

View file

@ -19,12 +19,14 @@ namespace JS {
GC_DEFINE_ALLOCATOR(RegExpObject); GC_DEFINE_ALLOCATOR(RegExpObject);
Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(StringView flags) Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(Utf16View const& flags)
{ {
bool d = false, g = false, i = false, m = false, s = false, u = false, y = false, v = false; bool d = false, g = false, i = false, m = false, s = false, u = false, y = false, v = false;
auto options = RegExpObject::default_flags; auto options = RegExpObject::default_flags;
for (auto ch : flags) { for (size_t index = 0; index < flags.length_in_code_units(); ++index) {
auto ch = flags.code_unit_at(index);
switch (ch) { switch (ch) {
case 'd': case 'd':
if (d) if (d)
@ -88,18 +90,17 @@ Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(Str
} }
// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern // 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets) ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets)
{ {
if (unicode && unicode_sets) if (unicode && unicode_sets)
return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v')) }; return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v')) };
auto utf16_pattern = Utf16String::from_utf8(pattern);
StringBuilder builder; StringBuilder builder;
// FIXME: We need to escape multi-byte code units for LibRegex to parse since the lexer there doesn't handle unicode. // FIXME: We need to escape multi-byte code units for LibRegex to parse since the lexer there doesn't handle unicode.
auto previous_code_unit_was_backslash = false; auto previous_code_unit_was_backslash = false;
for (size_t i = 0; i < utf16_pattern.length_in_code_units(); ++i) { for (size_t i = 0; i < pattern.length_in_code_units(); ++i) {
u16 code_unit = utf16_pattern.code_unit_at(i); u16 code_unit = pattern.code_unit_at(i);
if (code_unit > 0x7f) { if (code_unit > 0x7f) {
// Incorrectly escaping this code unit will result in a wildly different regex than intended // Incorrectly escaping this code unit will result in a wildly different regex than intended
@ -123,7 +124,7 @@ ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern,
} }
// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern // 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
ThrowCompletionOr<String> parse_regex_pattern(VM& vm, StringView pattern, bool unicode, bool unicode_sets) ThrowCompletionOr<String> parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets)
{ {
auto result = parse_regex_pattern(pattern, unicode, unicode_sets); auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
if (result.is_error()) if (result.is_error())
@ -137,7 +138,7 @@ GC::Ref<RegExpObject> RegExpObject::create(Realm& realm)
return realm.create<RegExpObject>(realm.intrinsics().regexp_prototype()); return realm.create<RegExpObject>(realm.intrinsics().regexp_prototype());
} }
GC::Ref<RegExpObject> RegExpObject::create(Realm& realm, Regex<ECMA262> regex, String pattern, String flags) GC::Ref<RegExpObject> RegExpObject::create(Realm& realm, Regex<ECMA262> regex, Utf16String pattern, Utf16String flags)
{ {
return realm.create<RegExpObject>(move(regex), move(pattern), move(flags), realm.intrinsics().regexp_prototype()); return realm.create<RegExpObject>(move(regex), move(pattern), move(flags), realm.intrinsics().regexp_prototype());
} }
@ -147,10 +148,12 @@ RegExpObject::RegExpObject(Object& prototype)
{ {
} }
static RegExpObject::Flags to_flag_bits(StringView flags) static RegExpObject::Flags to_flag_bits(Utf16View const& flags)
{ {
RegExpObject::Flags flag_bits = static_cast<RegExpObject::Flags>(0); RegExpObject::Flags flag_bits = static_cast<RegExpObject::Flags>(0);
for (auto ch : flags) {
for (size_t i = 0; i < flags.length_in_code_units(); ++i) {
auto ch = flags.code_unit_at(i);
switch (ch) { switch (ch) {
#define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \ #define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \
case #flag_char[0]: \ case #flag_char[0]: \
@ -165,7 +168,7 @@ static RegExpObject::Flags to_flag_bits(StringView flags)
return flag_bits; return flag_bits;
} }
RegExpObject::RegExpObject(Regex<ECMA262> regex, String pattern, String flags, Object& prototype) RegExpObject::RegExpObject(Regex<ECMA262> regex, Utf16String pattern, Utf16String flags, Object& prototype)
: Object(ConstructWithPrototypeTag::Tag, prototype) : Object(ConstructWithPrototypeTag::Tag, prototype)
, m_pattern(move(pattern)) , m_pattern(move(pattern))
, m_flags(move(flags)) , m_flags(move(flags))
@ -189,14 +192,14 @@ ThrowCompletionOr<GC::Ref<RegExpObject>> RegExpObject::regexp_initialize(VM& vm,
// 1. If pattern is undefined, let P be the empty String. // 1. If pattern is undefined, let P be the empty String.
// 2. Else, let P be ? ToString(pattern). // 2. Else, let P be ? ToString(pattern).
auto pattern = pattern_value.is_undefined() auto pattern = pattern_value.is_undefined()
? String {} ? Utf16String {}
: TRY(pattern_value.to_string(vm)); : TRY(pattern_value.to_utf16_string(vm));
// 3. If flags is undefined, let F be the empty String. // 3. If flags is undefined, let F be the empty String.
// 4. Else, let F be ? ToString(flags). // 4. Else, let F be ? ToString(flags).
auto flags = flags_value.is_undefined() auto flags = flags_value.is_undefined()
? String {} ? Utf16String {}
: TRY(flags_value.to_string(vm)); : TRY(flags_value.to_utf16_string(vm));
// 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception. // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception.
// 6. If F contains "i", let i be true; else let i be false. // 6. If F contains "i", let i be true; else let i be false.
@ -271,9 +274,9 @@ String RegExpObject::escape_regexp_pattern() const
// FIXME: Check the 'u' and 'v' flags and escape accordingly // FIXME: Check the 'u' and 'v' flags and escape accordingly
StringBuilder builder; StringBuilder builder;
auto pattern = Utf8View { m_pattern };
auto escaped = false; auto escaped = false;
for (auto code_point : pattern) {
for (auto code_point : m_pattern) {
if (escaped) { if (escaped) {
escaped = false; escaped = false;
builder.append_code_point('\\'); builder.append_code_point('\\');

View file

@ -19,12 +19,12 @@ namespace JS {
JS_API ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_create(VM&, Value pattern, Value flags); JS_API ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_create(VM&, Value pattern, Value flags);
ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_alloc(VM&, FunctionObject& new_target); ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_alloc(VM&, FunctionObject& new_target);
Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(StringView flags); Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(Utf16View const& flags);
struct ParseRegexPatternError { struct ParseRegexPatternError {
String error; String error;
}; };
ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets); ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets);
ThrowCompletionOr<String> parse_regex_pattern(VM& vm, StringView pattern, bool unicode, bool unicode_sets); ThrowCompletionOr<String> parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets);
class RegExpObject : public Object { class RegExpObject : public Object {
JS_OBJECT(RegExpObject, Object); JS_OBJECT(RegExpObject, Object);
@ -51,7 +51,7 @@ public:
}; };
static GC::Ref<RegExpObject> create(Realm&); static GC::Ref<RegExpObject> create(Realm&);
static GC::Ref<RegExpObject> create(Realm&, Regex<ECMA262> regex, String pattern, String flags); static GC::Ref<RegExpObject> create(Realm&, Regex<ECMA262> regex, Utf16String pattern, Utf16String flags);
ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_initialize(VM&, Value pattern, Value flags); ThrowCompletionOr<GC::Ref<RegExpObject>> regexp_initialize(VM&, Value pattern, Value flags);
String escape_regexp_pattern() const; String escape_regexp_pattern() const;
@ -59,8 +59,8 @@ public:
virtual void initialize(Realm&) override; virtual void initialize(Realm&) override;
virtual ~RegExpObject() override = default; virtual ~RegExpObject() override = default;
String const& pattern() const { return m_pattern; } Utf16String const& pattern() const { return m_pattern; }
String const& flags() const { return m_flags; } Utf16String const& flags() const { return m_flags; }
Flags flag_bits() const { return m_flag_bits; } Flags flag_bits() const { return m_flag_bits; }
Regex<ECMA262> const& regex() { return *m_regex; } Regex<ECMA262> const& regex() { return *m_regex; }
Regex<ECMA262> const& regex() const { return *m_regex; } Regex<ECMA262> const& regex() const { return *m_regex; }
@ -72,13 +72,13 @@ public:
private: private:
RegExpObject(Object& prototype); RegExpObject(Object& prototype);
RegExpObject(Regex<ECMA262> regex, String pattern, String flags, Object& prototype); RegExpObject(Regex<ECMA262> regex, Utf16String pattern, Utf16String flags, Object& prototype);
virtual bool is_regexp_object() const final { return true; } virtual bool is_regexp_object() const final { return true; }
virtual void visit_edges(Visitor&) override; virtual void visit_edges(Visitor&) override;
String m_pattern; Utf16String m_pattern;
String m_flags; Utf16String m_flags;
Flags m_flag_bits { 0 }; Flags m_flag_bits { 0 };
bool m_legacy_features_enabled { false }; // [[LegacyFeaturesEnabled]] bool m_legacy_features_enabled { false }; // [[LegacyFeaturesEnabled]]
// Note: This is initialized in RegExpAlloc, but will be non-null afterwards // Note: This is initialized in RegExpAlloc, but will be non-null afterwards

View file

@ -187,7 +187,7 @@ static ThrowCompletionOr<Value> regexp_builtin_exec(VM& vm, RegExpObject& regexp
// 5. If flags contains "y", let sticky be true; else let sticky be false. // 5. If flags contains "y", let sticky be true; else let sticky be false.
bool sticky = regex.options().has_flag_set(ECMAScriptFlags::Sticky); bool sticky = regex.options().has_flag_set(ECMAScriptFlags::Sticky);
// 6. If flags contains "d", let hasIndices be true, else let hasIndices be false. // 6. If flags contains "d", let hasIndices be true, else let hasIndices be false.
bool has_indices = regexp_object.flags().bytes_as_string_view().find('d').has_value(); bool has_indices = regexp_object.flags().contains('d');
// 7. If global is false and sticky is false, set lastIndex to 0. // 7. If global is false and sticky is false, set lastIndex to 0.
if (!global && !sticky) if (!global && !sticky)

View file

@ -605,6 +605,11 @@ public:
return JS::PrimitiveString::create(m_vm, string); return JS::PrimitiveString::create(m_vm, string);
}; };
auto decode_utf16_string = [&]() {
auto string = m_serialized.decode<Utf16String>();
return JS::PrimitiveString::create(m_vm, string);
};
auto decode_big_int = [&]() { auto decode_big_int = [&]() {
auto string = m_serialized.decode<String>(); auto string = m_serialized.decode<String>();
return JS::BigInt::create(m_vm, MUST(::Crypto::SignedBigInteger::from_base(10, string))); return JS::BigInt::create(m_vm, MUST(::Crypto::SignedBigInteger::from_base(10, string)));
@ -665,8 +670,8 @@ public:
// 11. Otherwise, if serialized.[[Type]] is "RegExp", then set value to a new RegExp object in targetRealm whose [[RegExpMatcher]] internal slot value is serialized.[[RegExpMatcher]], // 11. Otherwise, if serialized.[[Type]] is "RegExp", then set value to a new RegExp object in targetRealm whose [[RegExpMatcher]] internal slot value is serialized.[[RegExpMatcher]],
// whose [[OriginalSource]] internal slot value is serialized.[[OriginalSource]], and whose [[OriginalFlags]] internal slot value is serialized.[[OriginalFlags]]. // whose [[OriginalSource]] internal slot value is serialized.[[OriginalSource]], and whose [[OriginalFlags]] internal slot value is serialized.[[OriginalFlags]].
case ValueTag::RegExpObject: { case ValueTag::RegExpObject: {
auto pattern = decode_string(); auto pattern = decode_utf16_string();
auto flags = decode_string(); auto flags = decode_utf16_string();
value = MUST(JS::regexp_create(m_vm, pattern, flags)); value = MUST(JS::regexp_create(m_vm, pattern, flags));
break; break;