NOTE(review): This patch was recovered from a whitespace-collapsed copy in which all `<...>` text had been
stripped. The six `#include` targets in the first hunk and the return type in the third hunk's `@@` label could
not be recovered and are left as found; restore them from the upstream commit before applying. Blank context
lines were placed so that each hunk matches its line counts - confirm against the target file.
diff --git a/Libraries/LibJS/Runtime/RegExpConstructor.cpp b/Libraries/LibJS/Runtime/RegExpConstructor.cpp
index a0dad384159..b8c2e683b5f 100644
--- a/Libraries/LibJS/Runtime/RegExpConstructor.cpp
+++ b/Libraries/LibJS/Runtime/RegExpConstructor.cpp
@@ -4,6 +4,9 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include
+#include
+#include
 #include
 #include
 #include
@@ -27,6 +30,8 @@ void RegExpConstructor::initialize(Realm& realm)
     // 22.2.5.1 RegExp.prototype, https://tc39.es/ecma262/#sec-regexp.prototype
     define_direct_property(vm.names.prototype, realm.intrinsics().regexp_prototype(), 0);
 
+    u8 attr = Attribute::Writable | Attribute::Configurable;
+    define_native_function(realm, vm.names.escape, escape, 1, attr);
     define_native_accessor(realm, vm.well_known_symbol_species(), symbol_species_getter, {}, Attribute::Configurable);
 
     define_direct_property(vm.names.length, Value(2), Attribute::Configurable);
@@ -142,6 +147,111 @@ ThrowCompletionOr> RegExpConstructor::construct(FunctionObject&
     return TRY(regexp_object->regexp_initialize(vm, pattern_value, flags_value));
 }
 
+// 22.2.5.1.1 EncodeForRegExpEscape ( c ), https://tc39.es/proposal-regex-escaping/#sec-encodeforregexpescape
+static String encode_for_regexp_escape(u32 code_point)
+{
+    // https://tc39.es/ecma262/#table-controlescape-code-point-values
+    // Table 63: ControlEscape Code Point Values
+    struct ControlEscape {
+        u32 code_point { 0 };
+        char control_escape { 0 };
+    };
+    static constexpr auto control_escapes = to_array<ControlEscape>({
+        { 0x09, 't' },
+        { 0x0A, 'n' },
+        { 0x0B, 'v' },
+        { 0x0C, 'f' },
+        { 0x0D, 'r' },
+    });
+
+    // 1. If c is matched by SyntaxCharacter or c is U+002F (SOLIDUS), then
+    if (JS::is_syntax_character(code_point) || code_point == '/') {
+        // a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and UTF16EncodeCodePoint(c).
+        return MUST(String::formatted("\\{}", String::from_code_point(code_point)));
+    }
+
+    // 2. Else if c is the code point listed in some cell of the “Code Point” column of Table 63, then
+    auto it = find_if(control_escapes.begin(), control_escapes.end(), [&](auto const& escape) {
+        return escape.code_point == code_point;
+    });
+
+    if (it != control_escapes.end()) {
+        // a. Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and the string in the “ControlEscape” column
+        //    of the row whose “Code Point” column contains c.
+        return MUST(String::formatted("\\{}", it->control_escape));
+    }
+
+    // 3. Let otherPunctuators be the string-concatenation of ",-=<>#&!%:;@~'`" and the code unit 0x0022 (QUOTATION MARK).
+    // 4. Let toEscape be StringToCodePoints(otherPunctuators).
+    static constexpr Utf8View to_escape { ",-=<>#&!%:;@~'`\""sv };
+
+    // 5. If toEscape contains c, c is matched by either WhiteSpace or LineTerminator, or c has the same numeric value
+    //    as a leading surrogate or trailing surrogate, then
+    if (to_escape.contains(code_point) || JS::is_whitespace(code_point) || JS::is_line_terminator(code_point) || is_unicode_surrogate(code_point)) {
+        // a. Let cNum be the numeric value of c.
+        // b. If cNum ≤ 0xFF, then
+        if (code_point <= 0xFF) {
+            // i. Let hex be Number::toString(𝔽(cNum), 16).
+            // ii. Return the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS), "x", and
+            //     StringPad(hex, 2, "0", START).
+            return MUST(String::formatted("\\x{:02x}", code_point));
+        }
+
+        // c. Let escaped be the empty String.
+        // d. Let codeUnits be UTF16EncodeCodePoint(c).
+        // e. For each code unit cu of codeUnits, do
+        //     i. Set escaped to the string-concatenation of escaped and UnicodeEscape(cu).
+        // f. Return escaped.
+        // NOTE: A single \u escape suffices, as every code point the spec routes here (the punctuators above,
+        //       WhiteSpace, LineTerminator and surrogates) is in the BMP and so encodes to one UTF-16 code unit.
+        return MUST(String::formatted("\\u{:04x}", code_point));
+    }
+
+    // 6. Return UTF16EncodeCodePoint(c).
+    return String::from_code_point(code_point);
+}
+
+// 22.2.5.1 RegExp.escape ( S ), https://tc39.es/proposal-regex-escaping/
+JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::escape)
+{
+    auto string = vm.argument(0);
+
+    // 1. If S is not a String, throw a TypeError exception.
+    if (!string.is_string())
+        return vm.throw_completion<TypeError>(ErrorType::NotAString, string);
+
+    // 2. Let escaped be the empty String.
+    StringBuilder escaped(string.as_string().utf8_string().byte_count());
+
+    // 3. Let cpList be StringToCodePoints(S).
+    auto code_point_list = string.as_string().utf8_string();
+
+    // 4. For each code point c of cpList, do
+    for (auto code_point : code_point_list.code_points()) {
+        // a. If escaped is the empty String and c is matched by either DecimalDigit or AsciiLetter, then
+        if (escaped.is_empty() && is_ascii_alphanumeric(code_point)) {
+            // i. NOTE: Escaping a leading digit ensures that output corresponds with pattern text which may be used
+            //    after a \0 character escape or a DecimalEscape such as \1 and still match S rather than be interpreted
+            //    as an extension of the preceding escape sequence. Escaping a leading ASCII letter does the same for
+            //    the context after \c.
+
+            // ii. Let numericValue be the numeric value of c.
+            // iii. Let hex be Number::toString(𝔽(numericValue), 16).
+            // iv. Assert: The length of hex is 2.
+            // v. Set escaped to the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS), "x", and hex.
+            escaped.appendff("\\x{:02x}", code_point);
+        }
+        // b. Else,
+        else {
+            // i. Set escaped to the string-concatenation of escaped and EncodeForRegExpEscape(c).
+            escaped.append(encode_for_regexp_escape(code_point));
+        }
+    }
+
+    // 5. Return escaped.
+    return JS::PrimitiveString::create(vm, MUST(escaped.to_string()));
+}
+
 // 22.2.5.2 get RegExp [ @@species ], https://tc39.es/ecma262/#sec-get-regexp-@@species
 JS_DEFINE_NATIVE_FUNCTION(RegExpConstructor::symbol_species_getter)
 {
diff --git a/Libraries/LibJS/Runtime/RegExpConstructor.h b/Libraries/LibJS/Runtime/RegExpConstructor.h
index 6b3598a1344..eab56c324d7 100644
--- a/Libraries/LibJS/Runtime/RegExpConstructor.h
+++ b/Libraries/LibJS/Runtime/RegExpConstructor.h
@@ -29,6 +29,7 @@ private:
 
     virtual bool has_constructor() const override { return true; }
 
+    JS_DECLARE_NATIVE_FUNCTION(escape);
     JS_DECLARE_NATIVE_FUNCTION(symbol_species_getter);
     JS_DECLARE_NATIVE_FUNCTION(input_getter);
     JS_DECLARE_NATIVE_FUNCTION(input_alias_getter);
diff --git a/Libraries/LibJS/Tests/builtins/RegExp/RegExp.escape.js b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.escape.js
new file mode 100644
index 00000000000..e68da03e737
--- /dev/null
+++ b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.escape.js
@@ -0,0 +1,68 @@
+describe("errors", () => {
+    test("invalid string", () => {
+        expect(() => {
+            RegExp.escape(Symbol.hasInstance);
+        }).toThrowWithMessage(TypeError, "Symbol(Symbol.hasInstance) is not a string");
+    });
+});
+
+describe("normal behavior", () => {
+    test("first character is alphanumeric", () => {
+        const alphanumeric = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+        for (const ch of alphanumeric) {
+            const string = `${ch}${ch}${ch}`;
+            const expected = `\\x${ch.codePointAt(0).toString(16)}${ch}${ch}`;
+
+            expect(RegExp.escape(string)).toBe(expected);
+        }
+    });
+
+    test("syntax characters", () => {
+        const syntaxCharacters = "^$\\.*+?()[]{}|/";
+
+        for (const ch of syntaxCharacters) {
+            const string = `_${ch}_`;
+            const expected = `_\\${ch}_`;
+
+            expect(RegExp.escape(string)).toBe(expected);
+        }
+    });
+
+    test("control characters", () => {
+        expect(RegExp.escape("_\t_")).toBe("_\\t_");
+        expect(RegExp.escape("_\n_")).toBe("_\\n_");
+        expect(RegExp.escape("_\v_")).toBe("_\\v_");
+        expect(RegExp.escape("_\f_")).toBe("_\\f_");
+        expect(RegExp.escape("_\r_")).toBe("_\\r_");
+    });
+
+    test("punctuators", () => {
+        const punctuators = ",-=<>#&!%:;@~'`\"";
+
+        for (const ch of punctuators) {
+            const string = `_${ch}_`;
+            const expected = `_\\x${ch.codePointAt(0).toString(16)}_`;
+
+            expect(RegExp.escape(string)).toBe(expected);
+        }
+    });
+
+    test("non-ASCII whitespace", () => {
+        const nbsp = "\u00A0";
+
+        expect(RegExp.escape(nbsp)).toBe("\\xa0");
+        expect(RegExp.escape("\uFEFF")).toBe("\\ufeff");
+        expect(RegExp.escape("\u2028")).toBe("\\u2028");
+        expect(RegExp.escape("\u2029")).toBe("\\u2029");
+    });
+
+    test("Unicode surrogates", () => {
+        for (let ch = 0xd800; ch <= 0xdfff; ++ch) {
+            const string = String.fromCodePoint(ch);
+            const expected = `\\u${ch.toString(16)}`;
+
+            expect(RegExp.escape(string)).toBe(expected);
+        }
+    });
+});