AK+LibUnicode: Implement a case-insensitive variant of find_byte_offset

The existing String::find_byte_offset is case-sensitive. This variant
allows performing searches using Unicode-aware case folding.
This commit is contained in:
Timothy Flynn 2024-05-31 15:51:40 -04:00 committed by Andreas Kling
commit fe3fde2411
Notes: sideshowbarker 2024-07-16 21:30:46 +09:00
5 changed files with 188 additions and 0 deletions

View file

@ -6,6 +6,7 @@
#include <AK/CharacterTypes.h>
#include <AK/Platform.h>
#include <AK/ScopeGuard.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Utf16View.h>
@ -55,6 +56,18 @@ public:
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
}
// Reports where the underlying iterator currently sits inside the viewed string.
// The unit of the result follows the view type: a byte offset for Utf8View, a
// code unit offset for Utf16View, and an iterator offset for Utf32View.
size_t index() const
{
    if constexpr (IsSame<ViewType, Utf8View>) {
        return m_string.byte_offset_of(m_it);
    } else if constexpr (IsSame<ViewType, Utf16View>) {
        return m_string.code_unit_offset_of(m_it);
    } else if constexpr (IsSame<ViewType, Utf32View>) {
        return m_string.iterator_offset(m_it);
    } else {
        // Any other view type is rejected at compile time.
        static_assert(DependentFalse<ViewType>);
    }
}
u32 next_code_point()
{
VERIFY(has_more_data());
@ -103,6 +116,38 @@ template bool equals_ignoring_case(Utf8View, Utf8View);
template bool equals_ignoring_case(Utf16View, Utf16View);
template bool equals_ignoring_case(Utf32View, Utf32View);
// Looks for the first place in `lhs` (the haystack) where `rhs` (the needle)
// matches, comparing the code points produced by CasefoldStringComparator.
//
// Returns the comparator's index() at the start of the matching attempt, or an
// empty Optional when no attempt consumes all of `rhs` without a mismatch.
template<typename ViewType>
Optional<size_t> find_ignoring_case(ViewType lhs, ViewType rhs)
{
    CasefoldStringComparator haystack { lhs };

    while (haystack.has_more_data()) {
        // Every attempt restarts the needle from its beginning.
        CasefoldStringComparator needle { rhs };

        // Snapshot of the haystack at the start of this attempt: it supplies the
        // result on success and the rewind point on failure.
        auto attempt_start = haystack;

        auto mismatch_found = false;
        while (haystack.has_more_data() && needle.has_more_data()) {
            if (haystack.next_code_point() != needle.next_code_point()) {
                mismatch_found = true;
                break;
            }
        }

        // Running out of haystack first leaves needle data behind, which is not a match.
        if (!mismatch_found && !needle.has_more_data())
            return attempt_start.index();

        // Rewind to where this attempt began, then step forward by one code point
        // as produced by the comparator before trying again.
        haystack = move(attempt_start);
        haystack.next_code_point();
    }

    return {};
}
// Explicit instantiations of find_ignoring_case for the UTF-8, UTF-16 and UTF-32 view types.
template Optional<size_t> find_ignoring_case(Utf8View, Utf8View);
template Optional<size_t> find_ignoring_case(Utf16View, Utf16View);
template Optional<size_t> find_ignoring_case(Utf32View, Utf32View);
// Weak definitions that ignore their arguments and return an empty Optional (or false for the bool overload).
// NOTE(review): presumably these are fallbacks replaced by strong definitions when Unicode data is linked in - confirm.
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }

View file

@ -51,6 +51,9 @@ u32 to_unicode_titlecase(u32 code_point);
template<typename ViewType>
bool equals_ignoring_case(ViewType, ViewType);
// Searches the first view for the first occurrence of the second view, ignoring case.
// Returns the offset of the match within the first view (a byte offset for Utf8View,
// a code unit offset for Utf16View, an iterator offset for Utf32View), or an empty
// Optional if there is no match.
template<typename ViewType>
Optional<size_t> find_ignoring_case(ViewType, ViewType);
Optional<GeneralCategory> general_category_from_string(StringView);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);

View file

@ -47,4 +47,14 @@ bool String::equals_ignoring_case(String const& other) const
return Unicode::equals_ignoring_case(code_points(), other.code_points());
}
// Case-insensitive search for `needle` in this string, starting at `from_byte_offset`.
// Returns the byte offset of the match measured from the start of the whole string,
// or an empty Optional when Unicode::find_ignoring_case finds nothing.
// NOTE(review): `from_byte_offset` is handed to substring_view() without a bounds
// check in this function - confirm callers never pass an offset past the end.
Optional<size_t> String::find_byte_offset_ignoring_case(StringView needle, size_t from_byte_offset) const
{
    auto remaining_haystack = code_points().substring_view(from_byte_offset);
    Utf8View needle_code_points { needle };

    auto relative_offset = Unicode::find_ignoring_case(remaining_haystack, needle_code_points);
    if (!relative_offset.has_value())
        return {};

    // The search ran on the tail of the string, so shift the result back into
    // whole-string coordinates.
    return *relative_offset + from_byte_offset;
}
}