diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 1643acd37b9..a4731e62d34 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -506,10 +506,14 @@ TEST_CASE(ECMA262_parse) { ",(?", regex::Error::InvalidCaptureGroup }, // #4583 { "{1}", regex::Error::InvalidPattern }, { "{1,2}", regex::Error::InvalidPattern }, + { "\\uxxxx", regex::Error::NoError }, + { "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode }, + { "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { - Regex re(test.pattern); + Regex re(test.pattern, test.flags); EXPECT_EQ(re.parser_result.error, test.expected_error); if constexpr (REGEX_DEBUG) { dbgln("\n"); @@ -586,6 +590,45 @@ TEST_CASE(ECMA262_match) } } +TEST_CASE(ECMA262_unicode_match) +{ + struct _test { + char const* pattern; + char const* subject; + bool matches { true }; + ECMAScriptFlags options {}; + }; + _test tests[] { + { "\\ud83d", "😀", true }, + { "\\ud83d", "😀", false, ECMAScriptFlags::Unicode }, + { "\\ude00", "😀", true }, + { "\\ude00", "😀", false, ECMAScriptFlags::Unicode }, + { "\\ud83d\\ude00", "😀", true }, + { "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode }, + { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true }, + { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode }, + }; + + for (auto& test : tests) { + Regex re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options); + + auto subject = AK::utf8_to_utf16(test.subject); + Utf16View view { subject }; + + if constexpr (REGEX_DEBUG) { + dbgln("\n"); + RegexDebug regex_dbg(stderr); + regex_dbg.print_raw_bytecode(re); + regex_dbg.print_header(); + regex_dbg.print_bytecode(re); + dbgln("\n"); + } + + EXPECT_EQ(re.parser_result.error, Error::NoError); + EXPECT_EQ(re.match(view).success, test.matches); + } +} + TEST_CASE(replace) { struct _test { diff --git 
a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 677ad0cb0db..f5869acbdb6 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -465,12 +465,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M return ExecutionResult::Failed_ExecuteLowPrioForks; Optional str; + Vector utf16; Vector data; data.ensure_capacity(length); for (size_t i = offset; i < offset + length; ++i) data.unchecked_append(m_bytecode->at(i)); - auto view = input.view.construct_as_same(data, str); + auto view = input.view.construct_as_same(data, str, utf16); offset += length; if (!compare_string(input, state, view, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; @@ -553,7 +554,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt auto input_view = input.view.substring_view(state.string_position, 1); Optional str; - auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str); + Vector utf16; + auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16); bool equal; if (input.regex_options & AllFlags::Insensitive) equal = input_view.equals_ignoring_case(compare_view); diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index b58dc5e1329..6bc58ad78fd 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,11 @@ public: { } + RegexStringView(Utf16View view) + : m_view(view) + { + } + RegexStringView(Utf8View view) : m_view(view) { @@ -58,11 +64,19 @@ public: return m_view.get(); } + Utf16View const& u16_view() const + { + return m_view.get(); + } + Utf8View const& u8_view() const { return m_view.get(); } + bool unicode() const { return m_unicode; } + void set_unicode(bool unicode) { m_unicode = 
unicode; } + bool is_empty() const { return m_view.visit([](auto& view) { return view.is_empty(); }); @@ -75,12 +89,21 @@ public: size_t length() const { - return m_view.visit([](auto& view) { return view.length(); }); + if (unicode()) { + return m_view.visit( + [](Utf16View const& view) { return view.length_in_code_points(); }, + [](auto const& view) { return view.length(); }); + } + + return m_view.visit( + [](Utf16View const& view) { return view.length_in_code_units(); }, + [](Utf8View const& view) { return view.byte_length(); }, + [](auto const& view) { return view.length(); }); } - RegexStringView construct_as_same(Span data, Optional& optional_string_storage) const + RegexStringView construct_as_same(Span data, Optional& optional_string_storage, Vector& optional_utf16_storage) const { - return m_view.visit( + auto view = m_view.visit( [&](T const&) { StringBuilder builder; for (auto ch : data) @@ -90,7 +113,14 @@ public: }, [&](Utf32View) { return RegexStringView { Utf32View { data.data(), data.size() } }; + }, + [&](Utf16View) { + optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }); + return RegexStringView { Utf16View { optional_utf16_storage } }; }); + + view.set_unicode(unicode()); + return view; } Vector lines() const @@ -118,6 +148,21 @@ public: views.empend(view); return views; }, + [](Utf16View view) { + Vector views; + while (!view.is_empty()) { + size_t offset = 0; // Scan whole code units: a byte-wise memmem() can match '\n' straddling two adjacent code units. + while (offset < view.length_in_code_units() && view.data()[offset] != '\n') + ++offset; + if (offset == view.length_in_code_units()) + break; + views.empend(view.substring_view(0, offset)); + view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1); + } + if (!view.is_empty()) + views.empend(view); + return views; + }, [](Utf8View& view) { Vector views; auto it = view.begin(); @@ -147,15 +192,26 @@ public: RegexStringView substring_view(size_t offset, size_t length) const 
{ - return m_view.visit( - [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, - [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); + if (unicode()) { + auto view = m_view.visit( + [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, + [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }, + [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); + + view.set_unicode(unicode()); + return view; + } + + auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }); + view.set_unicode(unicode()); + return view; } String to_string() const { return m_view.visit( [](StringView view) { return view.to_string(); }, + [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); }, [](auto& view) { StringBuilder builder; for (auto it = view.begin(); it != view.end(); ++it) @@ -173,8 +229,8 @@ public: return 256u + ch; return ch; }, - [&](auto view) -> u32 { return view[index]; }, - [&](Utf8View& view) -> u32 { + [&](Utf32View& view) -> u32 { return view[index]; }, + [&](auto& view) -> u32 { size_t i = index; for (auto it = view.begin(); it != view.end(); ++it, --i) { if (i == 0) @@ -188,6 +244,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == cstring; }, + [&](Utf16View) { return to_string() == cstring; }, [&](Utf8View const& view) { return view.as_string() == cstring; }, [&](StringView view) { return view == cstring; }); } @@ -201,6 +258,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == string; }, + [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } @@ -209,6 +267,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() 
== string; }, + [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } @@ -224,6 +283,7 @@ public: [&](Utf32View view) { return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; }, + [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); }, [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } @@ -233,12 +293,25 @@ public: return !(*this == other); } + bool operator==(Utf16View const& other) const + { + return m_view.visit( + [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); }, + [&](Utf16View const& view) { return view == other; }, + [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, + [&](StringView view) { return view == RegexStringView { other }.to_string(); }); + } + + bool operator!=(Utf16View const& other) const + { + return !(*this == other); + } + bool operator==(Utf8View const& other) const { return m_view.visit( - [&](Utf32View) { - return to_string() == other.as_string(); - }, + [&](Utf32View) { return to_string() == other.as_string(); }, + [&](Utf16View) { return to_string() == other.as_string(); }, [&](Utf8View const& view) { return view.as_string() == other.as_string(); }, [&](StringView view) { return other.as_string() == view; }); } @@ -271,6 +344,9 @@ public: [&](Utf32View) -> bool { TODO(); }, + [&](Utf16View) -> bool { + TODO(); + }, [&](Utf8View const& view) { return view.as_string().starts_with(str); }, [&](StringView view) { return view.starts_with(str); }); } @@ -289,6 +365,7 @@ public: } return true; }, + [&](Utf16View) -> bool { TODO(); }, [&](Utf8View const& view) { auto it = view.begin(); for (auto code_point : str) { @@ -304,7 
+381,8 @@ public: } private: - Variant m_view; + Variant m_view; + bool m_unicode { false }; }; class Match final { diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index 60783b25f6d..f4a848741a4 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -84,6 +84,10 @@ RegexResult Matcher::match(Vector const views, Optional output.operations = 0; size_t lines_to_skip = 0; + bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode); + for (auto& view : views) + const_cast(view).set_unicode(unicode); + if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { if (views.size() > 1 && input.start_offset > views.first().length()) { dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip); diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 12c62fef78b..07885173b66 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace regex { @@ -1440,13 +1441,31 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini if (try_skip("u")) { if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) { - // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode. + // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be + // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit, + // but doesn't form a valid surrogate pair, insert bytecode for both code units individually. 
+ Optional low_surrogate; + if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) { + low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); + if (!low_surrogate.has_value()) { + set_error(Error::InvalidPattern); + return false; + } + + if (Utf16View::is_low_surrogate(*low_surrogate)) { + *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate); + low_surrogate.clear(); + } + } + match_length_minimum += 1; - StringBuilder builder; - builder.append_code_point(code_point.value()); - // FIXME: This isn't actually correct for ECMAScript. - auto u8_encoded = builder.string_view(); - stack.insert_bytecode_compare_string(u8_encoded); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); + + if (low_surrogate.has_value()) { + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } }); + } + return true; } else if (!unicode) { // '\u' is allowed in non-unicode mode, just matches 'u'.