mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-09-13 13:02:28 +00:00
LibJS: Do not directly append RegExp pattern code points during parse
Some checks are pending
CI / macOS, arm64, Sanitizer, Clang (push) Waiting to run
CI / Linux, x86_64, Fuzzers, Clang (push) Waiting to run
CI / Linux, x86_64, Sanitizer, GNU (push) Waiting to run
CI / Linux, x86_64, Sanitizer, Clang (push) Waiting to run
Package the js repl as a binary artifact / Linux, arm64 (push) Waiting to run
Package the js repl as a binary artifact / macOS, arm64 (push) Waiting to run
Package the js repl as a binary artifact / Linux, x86_64 (push) Waiting to run
Run test262 and test-wasm / run_and_update_results (push) Waiting to run
Lint Code / lint (push) Waiting to run
Label PRs with merge conflicts / auto-labeler (push) Waiting to run
Push notes / build (push) Waiting to run
Some checks are pending
CI / macOS, arm64, Sanitizer, Clang (push) Waiting to run
CI / Linux, x86_64, Fuzzers, Clang (push) Waiting to run
CI / Linux, x86_64, Sanitizer, GNU (push) Waiting to run
CI / Linux, x86_64, Sanitizer, Clang (push) Waiting to run
Package the js repl as a binary artifact / Linux, arm64 (push) Waiting to run
Package the js repl as a binary artifact / macOS, arm64 (push) Waiting to run
Package the js repl as a binary artifact / Linux, x86_64 (push) Waiting to run
Run test262 and test-wasm / run_and_update_results (push) Waiting to run
Lint Code / lint (push) Waiting to run
Label PRs with merge conflicts / auto-labeler (push) Waiting to run
Push notes / build (push) Waiting to run
There apparently is a disconnect between the spec, which asks us to construct the pattern from code points, and LibRegex, which is not able to parse those. Whenever a pattern contained multi-byte code points and we tried to match it in unicode mode, the match would fail. Change the parser to escape all non-ASCII code units instead of appending them directly. Fixes 2 test262 cases in `language/literals/regexp`.
This commit is contained in:
parent
7f6b70fafb
commit
5d19aacce7
Notes:
github-actions[bot]
2025-07-21 23:25:00 +00:00
Author: https://github.com/gmta
Commit: 5d19aacce7
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5548
Reviewed-by: https://github.com/alimpfard ✅
Reviewed-by: https://github.com/shannonbooth
2 changed files with 18 additions and 11 deletions
|
@ -96,19 +96,10 @@ ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern,
|
|||
auto utf16_pattern = Utf16String::from_utf8(pattern);
|
||||
StringBuilder builder;
|
||||
|
||||
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
||||
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
|
||||
// FIXME: We need to escape multi-byte code units for LibRegex to parse since the lexer there doesn't handle unicode.
|
||||
auto previous_code_unit_was_backslash = false;
|
||||
for (size_t i = 0; i < utf16_pattern.length_in_code_units();) {
|
||||
if (unicode || unicode_sets) {
|
||||
auto code_point = code_point_at(utf16_pattern, i);
|
||||
builder.append_code_point(code_point.code_point);
|
||||
i += code_point.code_unit_count;
|
||||
continue;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < utf16_pattern.length_in_code_units(); ++i) {
|
||||
u16 code_unit = utf16_pattern.code_unit_at(i);
|
||||
++i;
|
||||
|
||||
if (code_unit > 0x7f) {
|
||||
// Incorrectly escaping this code unit will result in a wildly different regex than intended
|
||||
|
|
|
@ -81,3 +81,19 @@ test("v flag should enable unicode mode", () => {
|
|||
test("parsing a large bytestring shouldn't crash", () => {
|
||||
RegExp(new Uint8Array(0x40000));
|
||||
});
|
||||
|
||||
test("Unicode non-ASCII matching", () => {
|
||||
const cases = [
|
||||
{ pattern: /é/u, match: "é", expected: ["é"] },
|
||||
{ pattern: /é/, match: "é", expected: ["é"] },
|
||||
{ pattern: /\u{61}/u, match: "a", expected: ["a"] },
|
||||
{ pattern: /\u{61}/, match: "a", expected: null },
|
||||
{ pattern: /😄/u, match: "😄", expected: ["😄"] },
|
||||
{ pattern: /😄/u, match: "\ud83d", expected: null },
|
||||
{ pattern: /😄/, match: "\ud83d", expected: null },
|
||||
];
|
||||
for (const test of cases) {
|
||||
const result = test.match.match(test.pattern);
|
||||
expect(result).toEqual(test.expected);
|
||||
}
|
||||
});
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue