diff --git a/Libraries/LibJS/AST.h b/Libraries/LibJS/AST.h index 8d87033230f..824c0c28031 100644 --- a/Libraries/LibJS/AST.h +++ b/Libraries/LibJS/AST.h @@ -1340,7 +1340,7 @@ private: class RegExpLiteral final : public Expression { public: - RegExpLiteral(SourceRange source_range, regex::Parser::Result parsed_regex, String parsed_pattern, regex::RegexOptions parsed_flags, String pattern, String flags) + RegExpLiteral(SourceRange source_range, regex::Parser::Result parsed_regex, String parsed_pattern, regex::RegexOptions parsed_flags, Utf16String pattern, Utf16String flags) : Expression(move(source_range)) , m_parsed_regex(move(parsed_regex)) , m_parsed_pattern(move(parsed_pattern)) @@ -1356,15 +1356,15 @@ public: regex::Parser::Result const& parsed_regex() const { return m_parsed_regex; } String const& parsed_pattern() const { return m_parsed_pattern; } regex::RegexOptions const& parsed_flags() const { return m_parsed_flags; } - String const& pattern() const { return m_pattern; } - String const& flags() const { return m_flags; } + Utf16String const& pattern() const { return m_pattern; } + Utf16String const& flags() const { return m_flags; } private: regex::Parser::Result m_parsed_regex; String m_parsed_pattern; regex::RegexOptions m_parsed_flags; - String m_pattern; - String m_flags; + Utf16String m_pattern; + Utf16String m_flags; }; class PrivateIdentifier final : public Expression { diff --git a/Libraries/LibJS/Bytecode/ASTCodegen.cpp b/Libraries/LibJS/Bytecode/ASTCodegen.cpp index 2366ed923ac..baae42a262d 100644 --- a/Libraries/LibJS/Bytecode/ASTCodegen.cpp +++ b/Libraries/LibJS/Bytecode/ASTCodegen.cpp @@ -444,8 +444,8 @@ Bytecode::CodeGenerationErrorOr> StringLiteral::generate Bytecode::CodeGenerationErrorOr> RegExpLiteral::generate_bytecode(Bytecode::Generator& generator, Optional preferred_dst) const { Bytecode::Generator::SourceLocationScope scope(generator, *this); - auto source_index = generator.intern_string(m_pattern); - auto flags_index = generator.intern_string(m_flags); + auto source_index = generator.intern_string(m_pattern.to_utf8_but_should_be_ported_to_utf16()); + auto flags_index = generator.intern_string(m_flags.to_utf8_but_should_be_ported_to_utf16()); auto regex_index = generator.intern_regex(Bytecode::ParsedRegex { .regex = m_parsed_regex, .pattern = m_parsed_pattern, diff --git a/Libraries/LibJS/Bytecode/Interpreter.cpp b/Libraries/LibJS/Bytecode/Interpreter.cpp index c784cecac74..90ba480f392 100644 --- a/Libraries/LibJS/Bytecode/Interpreter.cpp +++ b/Libraries/LibJS/Bytecode/Interpreter.cpp @@ -1542,7 +1542,7 @@ inline ThrowCompletionOr get_callee_and_this_from_environment(Byt } // 13.2.7.3 Runtime Semantics: Evaluation, https://tc39.es/ecma262/#sec-regular-expression-literals-runtime-semantics-evaluation -inline Value new_regexp(VM& vm, ParsedRegex const& parsed_regex, String const& pattern, String const& flags) +inline Value new_regexp(VM& vm, ParsedRegex const& parsed_regex, Utf16String pattern, Utf16String flags) { // 1. Let pattern be CodePointsToString(BodyText of RegularExpressionLiteral). // 2. Let flags be CodePointsToString(FlagText of RegularExpressionLiteral). @@ -1551,7 +1551,7 @@ inline Value new_regexp(VM& vm, ParsedRegex const& parsed_regex, String const& p auto& realm = *vm.current_realm(); Regex regex(parsed_regex.regex, parsed_regex.pattern.to_byte_string(), parsed_regex.flags); // NOTE: We bypass RegExpCreate and subsequently RegExpAlloc as an optimization to use the already parsed values. - auto regexp_object = RegExpObject::create(realm, move(regex), pattern, flags); + auto regexp_object = RegExpObject::create(realm, move(regex), move(pattern), move(flags)); // RegExpAlloc has these two steps from the 'Legacy RegExp features' proposal. regexp_object->set_realm(realm); // We don't need to check 'If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true' @@ -2264,8 +2264,8 @@ void NewRegExp::execute_impl(Bytecode::Interpreter& interpreter) const new_regexp( interpreter.vm(), interpreter.current_executable().regex_table->get(m_regex_index), - interpreter.current_executable().get_string(m_source_index), - interpreter.current_executable().get_string(m_flags_index))); + Utf16String::from_utf8(interpreter.current_executable().get_string(m_source_index)), + Utf16String::from_utf8(interpreter.current_executable().get_string(m_flags_index)))); } #define JS_DEFINE_NEW_BUILTIN_ERROR_OP(ErrorName) \ diff --git a/Libraries/LibJS/Parser.cpp b/Libraries/LibJS/Parser.cpp index 5699d14153b..e2fd95006f2 100644 --- a/Libraries/LibJS/Parser.cpp +++ b/Libraries/LibJS/Parser.cpp @@ -1858,16 +1858,17 @@ Parser::PrimaryExpressionParseResult Parser::parse_primary_expression() NonnullRefPtr Parser::parse_regexp_literal() { auto rule_start = push_start(); - auto pattern = consume().value().to_utf8_but_should_be_ported_to_utf16(); - // Remove leading and trailing slash. - pattern = MUST(pattern.substring_from_byte_offset(1, pattern.bytes().size() - 2)); + auto pattern_view = consume().value(); - auto flags = String {}; + // Remove leading and trailing slash. + auto pattern = Utf16String::from_utf16(pattern_view.substring_view(1, pattern_view.length_in_code_units() - 2)); + + Utf16String flags {}; auto parsed_flags = RegExpObject::default_flags; if (match(TokenType::RegexFlags)) { auto flags_start = position(); - flags = consume().value().to_utf8_but_should_be_ported_to_utf16(); + flags = consume().fly_string_value().to_utf16_string(); auto parsed_flags_or_error = regex_flags_from_string(flags); if (parsed_flags_or_error.is_error()) @@ -1890,7 +1891,7 @@ NonnullRefPtr Parser::parse_regexp_literal() syntax_error(MUST(String::formatted("RegExp compile error: {}", Regex(parsed_regex, parsed_pattern.to_byte_string(), parsed_flags).error_string())), rule_start.position()); SourceRange range { m_source_code, rule_start.position(), position() }; - return create_ast_node(move(range), move(parsed_regex), move(parsed_pattern), move(parsed_flags), move(pattern), move(flags)); + return create_ast_node(move(range), move(parsed_regex), move(parsed_pattern), parsed_flags, move(pattern), move(flags)); } static bool is_simple_assignment_target(Expression const& expression, bool allow_web_reality_call_expression = true) diff --git a/Libraries/LibJS/Runtime/RegExpObject.cpp b/Libraries/LibJS/Runtime/RegExpObject.cpp index 57caf5bfc39..33745f3aca7 100644 --- a/Libraries/LibJS/Runtime/RegExpObject.cpp +++ b/Libraries/LibJS/Runtime/RegExpObject.cpp @@ -19,12 +19,14 @@ namespace JS { GC_DEFINE_ALLOCATOR(RegExpObject); -Result, String> regex_flags_from_string(StringView flags) +Result, String> regex_flags_from_string(Utf16View const& flags) { bool d = false, g = false, i = false, m = false, s = false, u = false, y = false, v = false; auto options = RegExpObject::default_flags; - for (auto ch : flags) { + for (size_t index = 0; index < flags.length_in_code_units(); ++index) { + auto ch = flags.code_unit_at(index); + switch (ch) { case 'd': if (d) @@ -88,18 +90,17 @@ Result, String> regex_flags_from_string(Str } // 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern -ErrorOr parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets) +ErrorOr parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets) { if (unicode && unicode_sets) return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v')) }; - auto utf16_pattern = Utf16String::from_utf8(pattern); StringBuilder builder; // FIXME: We need to escape multi-byte code units for LibRegex to parse since the lexer there doesn't handle unicode. auto previous_code_unit_was_backslash = false; - for (size_t i = 0; i < utf16_pattern.length_in_code_units(); ++i) { - u16 code_unit = utf16_pattern.code_unit_at(i); + for (size_t i = 0; i < pattern.length_in_code_units(); ++i) { + u16 code_unit = pattern.code_unit_at(i); if (code_unit > 0x7f) { // Incorrectly escaping this code unit will result in a wildly different regex than intended @@ -123,7 +124,7 @@ ErrorOr parse_regex_pattern(StringView pattern, } // 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern -ThrowCompletionOr parse_regex_pattern(VM& vm, StringView pattern, bool unicode, bool unicode_sets) +ThrowCompletionOr parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets) { auto result = parse_regex_pattern(pattern, unicode, unicode_sets); if (result.is_error()) @@ -137,7 +138,7 @@ GC::Ref RegExpObject::create(Realm& realm) return realm.create(realm.intrinsics().regexp_prototype()); } -GC::Ref RegExpObject::create(Realm& realm, Regex regex, String pattern, String flags) +GC::Ref RegExpObject::create(Realm& realm, Regex regex, Utf16String pattern, Utf16String flags) { return realm.create(move(regex), move(pattern), move(flags), realm.intrinsics().regexp_prototype()); } @@ -147,10 +148,12 @@ RegExpObject::RegExpObject(Object& prototype) { } -static RegExpObject::Flags to_flag_bits(StringView flags) +static RegExpObject::Flags to_flag_bits(Utf16View const& flags) { RegExpObject::Flags flag_bits = static_cast(0); - for (auto ch : flags) { + + for (size_t i = 0; i < flags.length_in_code_units(); ++i) { + auto ch = flags.code_unit_at(i); switch (ch) { #define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \ case #flag_char[0]: \ @@ -165,7 +168,7 @@ static RegExpObject::Flags to_flag_bits(StringView flags) return flag_bits; } -RegExpObject::RegExpObject(Regex regex, String pattern, String flags, Object& prototype) +RegExpObject::RegExpObject(Regex regex, Utf16String pattern, Utf16String flags, Object& prototype) : Object(ConstructWithPrototypeTag::Tag, prototype) , m_pattern(move(pattern)) , m_flags(move(flags)) @@ -189,14 +192,14 @@ ThrowCompletionOr> RegExpObject::regexp_initialize(VM& vm, // 1. If pattern is undefined, let P be the empty String. // 2. Else, let P be ? ToString(pattern). auto pattern = pattern_value.is_undefined() - ? String {} - : TRY(pattern_value.to_string(vm)); + ? Utf16String {} + : TRY(pattern_value.to_utf16_string(vm)); // 3. If flags is undefined, let F be the empty String. // 4. Else, let F be ? ToString(flags). auto flags = flags_value.is_undefined() - ? String {} - : TRY(flags_value.to_string(vm)); + ? Utf16String {} + : TRY(flags_value.to_utf16_string(vm)); // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception. // 6. If F contains "i", let i be true; else let i be false. @@ -271,9 +274,9 @@ String RegExpObject::escape_regexp_pattern() const // FIXME: Check the 'u' and 'v' flags and escape accordingly StringBuilder builder; - auto pattern = Utf8View { m_pattern }; auto escaped = false; - for (auto code_point : pattern) { + + for (auto code_point : m_pattern) { if (escaped) { escaped = false; builder.append_code_point('\\'); diff --git a/Libraries/LibJS/Runtime/RegExpObject.h b/Libraries/LibJS/Runtime/RegExpObject.h index b0958610099..dd24057d08d 100644 --- a/Libraries/LibJS/Runtime/RegExpObject.h +++ b/Libraries/LibJS/Runtime/RegExpObject.h @@ -19,12 +19,12 @@ namespace JS { JS_API ThrowCompletionOr> regexp_create(VM&, Value pattern, Value flags); ThrowCompletionOr> regexp_alloc(VM&, FunctionObject& new_target); -Result, String> regex_flags_from_string(StringView flags); +Result, String> regex_flags_from_string(Utf16View const& flags); struct ParseRegexPatternError { String error; }; -ErrorOr parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets); -ThrowCompletionOr parse_regex_pattern(VM& vm, StringView pattern, bool unicode, bool unicode_sets); +ErrorOr parse_regex_pattern(Utf16View const& pattern, bool unicode, bool unicode_sets); +ThrowCompletionOr parse_regex_pattern(VM& vm, Utf16View const& pattern, bool unicode, bool unicode_sets); class RegExpObject : public Object { JS_OBJECT(RegExpObject, Object); @@ -51,7 +51,7 @@ public: }; static GC::Ref create(Realm&); - static GC::Ref create(Realm&, Regex regex, String pattern, String flags); + static GC::Ref create(Realm&, Regex regex, Utf16String pattern, Utf16String flags); ThrowCompletionOr> regexp_initialize(VM&, Value pattern, Value flags); String escape_regexp_pattern() const; @@ -59,8 +59,8 @@ public: virtual void initialize(Realm&) override; virtual ~RegExpObject() override = default; - String const& pattern() const { return m_pattern; } - String const& flags() const { return m_flags; } + Utf16String const& pattern() const { return m_pattern; } + Utf16String const& flags() const { return m_flags; } Flags flag_bits() const { return m_flag_bits; } Regex const& regex() { return *m_regex; } Regex const& regex() const { return *m_regex; } @@ -72,13 +72,13 @@ public: private: RegExpObject(Object& prototype); - RegExpObject(Regex regex, String pattern, String flags, Object& prototype); + RegExpObject(Regex regex, Utf16String pattern, Utf16String flags, Object& prototype); virtual bool is_regexp_object() const final { return true; } virtual void visit_edges(Visitor&) override; - String m_pattern; - String m_flags; + Utf16String m_pattern; + Utf16String m_flags; Flags m_flag_bits { 0 }; bool m_legacy_features_enabled { false }; // [[LegacyFeaturesEnabled]] // Note: This is initialized in RegExpAlloc, but will be non-null afterwards diff --git a/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Libraries/LibJS/Runtime/RegExpPrototype.cpp index e8a82d3e6c9..64c1ffd8af9 100644 --- a/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -187,7 +187,7 @@ static ThrowCompletionOr regexp_builtin_exec(VM& vm, RegExpObject& regexp // 5. If flags contains "y", let sticky be true; else let sticky be false. bool sticky = regex.options().has_flag_set(ECMAScriptFlags::Sticky); // 6. If flags contains "d", let hasIndices be true, else let hasIndices be false. - bool has_indices = regexp_object.flags().bytes_as_string_view().find('d').has_value(); + bool has_indices = regexp_object.flags().contains('d'); // 7. If global is false and sticky is false, set lastIndex to 0. if (!global && !sticky) diff --git a/Libraries/LibWeb/HTML/StructuredSerialize.cpp b/Libraries/LibWeb/HTML/StructuredSerialize.cpp index a4889719e50..25a329f5075 100644 --- a/Libraries/LibWeb/HTML/StructuredSerialize.cpp +++ b/Libraries/LibWeb/HTML/StructuredSerialize.cpp @@ -605,6 +605,11 @@ public: return JS::PrimitiveString::create(m_vm, string); }; + auto decode_utf16_string = [&]() { + auto string = m_serialized.decode(); + return JS::PrimitiveString::create(m_vm, string); + }; + auto decode_big_int = [&]() { auto string = m_serialized.decode(); return JS::BigInt::create(m_vm, MUST(::Crypto::SignedBigInteger::from_base(10, string))); @@ -665,8 +670,8 @@ public: // 11. Otherwise, if serialized.[[Type]] is "RegExp", then set value to a new RegExp object in targetRealm whose [[RegExpMatcher]] internal slot value is serialized.[[RegExpMatcher]], // whose [[OriginalSource]] internal slot value is serialized.[[OriginalSource]], and whose [[OriginalFlags]] internal slot value is serialized.[[OriginalFlags]]. case ValueTag::RegExpObject: { - auto pattern = decode_string(); - auto flags = decode_string(); + auto pattern = decode_utf16_string(); + auto flags = decode_utf16_string(); value = MUST(JS::regexp_create(m_vm, pattern, flags)); break;