From 81fc8ab8ccd308b5c083054564b1ea301e2ee73f Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 21 Jul 2025 14:08:32 -0400 Subject: [PATCH] LibRegex: Rename a couple of RegexStringView methods for clarity `operator[]` -> `code_point_at` `code_unit_at` -> `unicode_aware_code_point_at` `unicode_aware_code_point_at` returns either a code point or a code unit depending on the Unicode flag. --- Libraries/LibRegex/RegexByteCode.cpp | 34 ++++++++++++++-------------- Libraries/LibRegex/RegexMatch.h | 12 +++++----- Libraries/LibRegex/RegexMatcher.cpp | 2 +- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp index 76f047f7460..17417dd7059 100644 --- a/Libraries/LibRegex/RegexByteCode.cpp +++ b/Libraries/LibRegex/RegexByteCode.cpp @@ -116,7 +116,7 @@ static void advance_string_position(MatchState& state, RegexStringView view, Opt if (view.unicode()) { if (!code_point.has_value() && (state.string_position_in_code_units < view.length_in_code_units())) - code_point = view[state.string_position_in_code_units]; + code_point = view.code_point_at(state.string_position_in_code_units); if (code_point.has_value()) state.string_position_in_code_units += view.length_of_code_point(*code_point); } else { @@ -282,7 +282,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input return true; if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) { - auto input_view = input.view.substring_view(state.string_position - 1, 1)[0]; + auto input_view = input.view.substring_view(state.string_position - 1, 1).code_point_at(0); return input_view == '\r' || input_view == '\n' || input_view == LineSeparator || input_view == ParagraphSeparator; } @@ -304,14 +304,14 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; }; auto is_word_boundary = [&] { if (state.string_position == input.view.length()) { - return (state.string_position > 0 && isword(input.view[state.string_position_in_code_units - 1])); + return (state.string_position > 0 && isword(input.view.code_point_at(state.string_position_in_code_units - 1))); } if (state.string_position == 0) { - return (isword(input.view[0])); + return (isword(input.view.code_point_at(0))); } - return !!(isword(input.view[state.string_position_in_code_units]) ^ isword(input.view[state.string_position_in_code_units - 1])); + return !!(isword(input.view.code_point_at(state.string_position_in_code_units)) ^ isword(input.view.code_point_at(state.string_position_in_code_units - 1))); }; switch (type()) { case BoundaryCheckType::Word: { @@ -335,7 +335,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, return true; if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) { - auto input_view = input.view.substring_view(state.string_position, 1)[0]; + auto input_view = input.view.substring_view(state.string_position, 1).code_point_at(0); return input_view == '\r' || input_view == '\n' || input_view == LineSeparator || input_view == ParagraphSeparator; } @@ -488,7 +488,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; - auto input_view = input.view.substring_view(state.string_position, 1)[0]; + auto input_view = input.view.substring_view(state.string_position, 1).code_point_at(0); auto is_equivalent_to_newline = input_view == '\n' || (input.regex_options.has_flag_set(AllFlags::Internal_ECMA262DotSemantics) ? (input_view == '\r' || input_view == LineSeparator || input_view == ParagraphSeparator) @@ -531,7 +531,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M return ExecutionResult::Failed_ExecuteLowPrioForks; auto character_class = (CharClass)m_bytecode->at(offset++); - auto ch = input.view.code_unit_at(state.string_position_in_code_units); + auto ch = input.view.unicode_aware_code_point_at(state.string_position_in_code_units); compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched); break; @@ -548,7 +548,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M offset += count_insensitive; bool const insensitive = input.regex_options & AllFlags::Insensitive; - auto ch = input.view.code_unit_at(state.string_position_in_code_units); + auto ch = input.view.unicode_aware_code_point_at(state.string_position_in_code_units); if (insensitive) ch = to_ascii_lowercase(ch); @@ -578,7 +578,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M auto from = value.from; auto to = value.to; - auto ch = input.view.code_unit_at(state.string_position_in_code_units); + auto ch = input.view.unicode_aware_code_point_at(state.string_position_in_code_units); compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched); break; @@ -711,8 +711,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt // FIXME: Figure out how to do this if unicode() without performing a substring split first. auto input_view = input.view.unicode() - ? input.view.substring_view(state.string_position, 1)[0] - : input.view.code_unit_at(state.string_position_in_code_units); + ? input.view.substring_view(state.string_position, 1).code_point_at(0) + : input.view.unicode_aware_code_point_at(state.string_position_in_code_units); bool equal; if (input.regex_options & AllFlags::Insensitive) { @@ -753,7 +753,7 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, Match if (str.length() == 1) { auto inverse_matched = false; - compare_char(input, state, str[0], false, inverse_matched); + compare_char(input, state, str.code_point_at(0), false, inverse_matched); return !inverse_matched; } @@ -843,7 +843,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat if (state.string_position == input.view.length()) return; - u32 code_point = input.view[state.string_position_in_code_units]; + u32 code_point = input.view.code_point_at(state.string_position_in_code_units); bool equal = Unicode::code_point_has_property(code_point, property); if (equal) { @@ -859,7 +859,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& in if (state.string_position == input.view.length()) return; - u32 code_point = input.view[state.string_position_in_code_units]; + u32 code_point = input.view.code_point_at(state.string_position_in_code_units); bool equal = Unicode::code_point_has_general_category(code_point, general_category); if (equal) { @@ -875,7 +875,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, Match if (state.string_position == input.view.length()) return; - u32 code_point = input.view[state.string_position_in_code_units]; + u32 code_point = input.view.code_point_at(state.string_position_in_code_units); bool equal = Unicode::code_point_has_script(code_point, script); if (equal) { @@ -891,7 +891,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_script_extension(MatchInput const& in if (state.string_position == input.view.length()) return; - u32 code_point = input.view[state.string_position_in_code_units]; + u32 code_point = input.view.code_point_at(state.string_position_in_code_units); bool equal = Unicode::code_point_has_script_extension(code_point, script); if (equal) { diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h index ddbc36fee83..63cb7785843 100644 --- a/Libraries/LibRegex/RegexMatch.h +++ b/Libraries/LibRegex/RegexMatch.h @@ -201,25 +201,25 @@ public: }); } - // Note: index must always be the code unit offset to return. - u32 operator[](size_t index) const + u32 code_point_at(size_t code_unit_index) const { return m_view.visit( [&](StringView view) -> u32 { - auto ch = view[index]; + auto ch = view[code_unit_index]; if constexpr (IsSigned) { if (ch < 0) return 256u + ch; return ch; } }, - [&](Utf16View const& view) -> u32 { return view.code_point_at(index); }); + [&](Utf16View const& view) -> u32 { return view.code_point_at(code_unit_index); }); } - u32 code_unit_at(size_t code_unit_index) const + // Returns the code point at the code unit offset if the Unicode flag is set. Otherwise, returns the code unit. + u32 unicode_aware_code_point_at(size_t code_unit_index) const { if (unicode()) - return operator[](code_unit_index); + return code_point_at(code_unit_index); return m_view.visit( [&](StringView view) -> u32 { diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp index fc5fc858df1..2c0e7c0853c 100644 --- a/Libraries/LibRegex/RegexMatcher.cpp +++ b/Libraries/LibRegex/RegexMatcher.cpp @@ -291,7 +291,7 @@ RegexResult Matcher::match(Vector const& views, Optiona auto const insensitive = input.regex_options.has_flag_set(AllFlags::Insensitive); if (auto& starting_ranges = m_pattern->parser_result.optimization_data.starting_ranges; !starting_ranges.is_empty()) { auto ranges = insensitive ? m_pattern->parser_result.optimization_data.starting_ranges_insensitive.span() : starting_ranges.span(); - auto ch = input.view.code_unit_at(view_index); + auto ch = input.view.unicode_aware_code_point_at(view_index); if (insensitive) ch = to_ascii_lowercase(ch);