AK+Everywhere: Allow lonely UTF-16 surrogates by default

By definition, the web allows lonely surrogates by default. Let's have
our string APIs reflect this, so we don't have to pass an allow option
all over the place.
This commit is contained in:
Timothy Flynn 2025-06-26 19:52:09 -04:00 committed by Tim Flynn
commit 9fc3e72db2
Notes: github-actions[bot] 2025-07-03 13:53:17 +00:00
14 changed files with 74 additions and 77 deletions

View file

@@ -185,16 +185,16 @@ Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
return substring_view(substring_start, substring_length);
}
bool Utf8View::validate(size_t& valid_bytes, AllowSurrogates allow_surrogates) const
bool Utf8View::validate(size_t& valid_bytes, AllowLonelySurrogates allow_lonely_surrogates) const
{
auto result = simdutf::validate_utf8_with_errors(m_string.characters_without_null_termination(), m_string.length());
valid_bytes = result.count;
if (result.error == simdutf::SURROGATE && allow_surrogates == AllowSurrogates::Yes) {
if (result.error == simdutf::SURROGATE && allow_lonely_surrogates == AllowLonelySurrogates::Yes) {
valid_bytes += 3; // All surrogates have a UTF-8 byte length of 3.
size_t substring_valid_bytes = 0;
auto is_valid = substring_view(valid_bytes).validate(substring_valid_bytes, allow_surrogates);
auto is_valid = substring_view(valid_bytes).validate(substring_valid_bytes, allow_lonely_surrogates);
valid_bytes += substring_valid_bytes;
return is_valid;