AK+LibTextCodec: Stop using Utf16View endianness override

This is preparation for removing the endianness override, since it was
only used by a single client: LibTextCodec.

While here, add helpers and make use of simdutf for fast conversion.
This commit is contained in:
Andreas Kling 2025-04-15 17:49:09 +02:00
parent 7029957a6b
commit e218cd3766
7 changed files with 56 additions and 53 deletions

View file

@ -61,6 +61,36 @@ ErrorOr<String> String::from_utf8(StringView view)
return result;
}
// Creates a new String from UTF-16LE encoded bytes.
//
// Returns an error if the input does not pass validate_utf16_le(). Empty input yields an empty String.
// The number of code units converted is bytes.size() / 2, so a trailing odd byte is not converted.
ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
{
    if (!validate_utf16_le(bytes))
        return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
    if (bytes.is_empty())
        return String {};

    char16_t const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    size_t utf16_length = bytes.size() / 2;

    // Size the output with the little-endian-specific length function. The endianness-agnostic
    // simdutf::utf8_length_from_utf16() interprets code units in host byte order, which would
    // compute the wrong size (and possibly an undersized buffer) on a big-endian host.
    size_t required_utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);

    Vector<u8> buffer;
    buffer.resize(required_utf8_length);

    auto utf8_length = simdutf::convert_utf16le_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));
    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), utf8_length });
}
// Creates a new String from UTF-16BE encoded bytes.
//
// Returns an error if the input does not pass validate_utf16_be(). Empty input yields an empty String.
// The number of code units converted is bytes.size() / 2, so a trailing odd byte is not converted.
ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
{
    if (!validate_utf16_be(bytes))
        return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
    if (bytes.is_empty())
        return String {};

    char16_t const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    size_t utf16_length = bytes.size() / 2;

    // Size the output with the big-endian-specific length function. The endianness-agnostic
    // simdutf::utf8_length_from_utf16() interprets code units in host byte order; on a
    // little-endian host it would read byte-swapped values (e.g. U+4E00 as 0x004E) and
    // under-count the UTF-8 bytes needed, letting the conversion below overrun the buffer.
    size_t required_utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);

    Vector<u8> buffer;
    buffer.resize(required_utf8_length);

    auto utf8_length = simdutf::convert_utf16be_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));
    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), utf8_length });
}
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
{
if (!utf16.validate())

View file

@ -70,6 +70,8 @@ public:
// Creates a new String from a sequence of UTF-16 encoded code points.
static ErrorOr<String> from_utf16(Utf16View const&);
// Creates a new String from UTF-16LE encoded bytes; returns an error if the bytes fail UTF-16LE validation.
static ErrorOr<String> from_utf16_le(ReadonlyBytes);
// Creates a new String from UTF-16BE encoded bytes; returns an error if the bytes fail UTF-16BE validation.
static ErrorOr<String> from_utf16_be(ReadonlyBytes);
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
static ErrorOr<String> from_stream(Stream&, size_t byte_count);

View file

@ -421,4 +421,14 @@ size_t Utf16CodePointIterator::length_in_code_units() const
return *(*this) < first_supplementary_plane_code_point ? 1 : 2;
}
// Checks whether the given bytes form valid little-endian UTF-16.
// The bytes are viewed as bytes.size() / 2 code units and handed to simdutf for validation.
bool validate_utf16_le(ReadonlyBytes bytes)
{
    auto const* code_units = reinterpret_cast<char16_t const*>(bytes.data());
    auto const code_unit_count = bytes.size() / 2;
    return simdutf::validate_utf16le(code_units, code_unit_count);
}
// Checks whether the given bytes form valid big-endian UTF-16.
// The bytes are viewed as bytes.size() / 2 code units and handed to simdutf for validation.
bool validate_utf16_be(ReadonlyBytes bytes)
{
    auto const* code_units = reinterpret_cast<char16_t const*>(bytes.data());
    auto const code_unit_count = bytes.size() / 2;
    return simdutf::validate_utf16be(code_units, code_unit_count);
}
}

View file

@ -26,6 +26,9 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host)
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
// Returns whether the given bytes are valid UTF-16LE. The bytes are treated as size() / 2 code units.
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
// Returns whether the given bytes are valid UTF-16BE. The bytes are treated as size() / 2 code units.
[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
size_t utf16_code_unit_length_from_utf8(StringView);
class Utf16View;

View file

@ -369,25 +369,9 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
return String::from_utf8_with_replacement_character(input);
}
// Reinterprets the bytes of `view` as 16-bit code units (length() / 2 of them) and wraps them
// in a Utf16View that decodes with the requested endianness.
static Utf16View as_utf16(StringView view, AK::Endianness endianness)
{
    auto const* code_units = reinterpret_cast<u16 const*>(view.bytes().data());
    auto const code_unit_count = view.length() / 2;
    return Utf16View { { code_units, code_unit_count }, endianness };
}
// Iterates over `input` interpreted as big-endian UTF-16 and passes each code point to
// `on_code_point`, propagating any error the callback returns.
ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
{
    auto big_endian_view = as_utf16(input, AK::Endianness::Big);
    for (auto decoded : big_endian_view)
        TRY(on_code_point(decoded));
    return {};
}
// Returns the result of validating `input` as big-endian UTF-16.
// NOTE(review): two return statements appear back to back below; as written only the first
// (the Utf16View-based check) is reachable. This looks like the before/after lines of a diff
// shown together — confirm which one is meant to remain.
bool UTF16BEDecoder::validate(StringView input)
{
return as_utf16(input, AK::Endianness::Big).validate();
return AK::validate_utf16_be(input.bytes());
}
ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
@ -396,20 +380,12 @@ ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
input = input.substring_view(2);
return String::from_utf16(as_utf16(input, AK::Endianness::Big));
}
ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
{
for (auto code_point : as_utf16(input, AK::Endianness::Little))
TRY(on_code_point(code_point));
return {};
return String::from_utf16_be(input.bytes());
}
// Returns the result of validating `input` as little-endian UTF-16.
// NOTE(review): two return statements appear back to back below; as written only the first
// (the Utf16View-based check) is reachable. This looks like the before/after lines of a diff
// shown together — confirm which one is meant to remain.
bool UTF16LEDecoder::validate(StringView input)
{
return as_utf16(input, AK::Endianness::Little).validate();
return AK::validate_utf16_le(input.bytes());
}
ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
@ -418,7 +394,7 @@ ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
input = input.substring_view(2);
return String::from_utf16(as_utf16(input, AK::Endianness::Little));
return String::from_utf16_le(input.bytes());
}
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)

View file

@ -17,12 +17,12 @@ namespace TextCodec {
// Polymorphic base class for the decoders in this namespace.
// NOTE(review): process() is declared twice below, once under `public` and once under
// `protected`. This looks like the before/after lines of a diff shown together — confirm
// which access level is intended.
class Decoder {
public:
// Pure virtual; takes the input and a callback that receives a u32 code point.
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
// Virtual, not pure: declared here without a body.
virtual bool validate(StringView);
// Virtual, not pure: declared here without a body.
virtual ErrorOr<String> to_utf8(StringView);
protected:
// Protected destructor: code outside the class hierarchy cannot destroy a decoder through a Decoder pointer.
virtual ~Decoder() = default;
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
};
class UTF8Decoder final : public Decoder {
@ -34,16 +34,20 @@ public:
// Decoder for big-endian UTF-16 input; overrides validate() and to_utf8().
// NOTE(review): process() appears twice below (a public declaration and a private inline
// definition). This looks like the before/after lines of a diff shown together — confirm
// which one is intended.
class UTF16BEDecoder final : public Decoder {
public:
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
virtual bool validate(StringView) override;
virtual ErrorOr<String> to_utf8(StringView) override;
private:
// This private override is never expected to run: it hits VERIFY_NOT_REACHED() if called.
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
};
// Decoder for little-endian UTF-16 input; overrides validate() and to_utf8().
// NOTE(review): process() appears twice below (a public declaration and a private inline
// definition). This looks like the before/after lines of a diff shown together — confirm
// which one is intended.
class UTF16LEDecoder final : public Decoder {
public:
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
virtual bool validate(StringView) override;
virtual ErrorOr<String> to_utf8(StringView) override;
private:
// This private override is never expected to run: it hits VERIFY_NOT_REACHED() if called.
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
};
template<Integral ArrayType = u32>

View file

@ -34,17 +34,6 @@ TEST_CASE(test_utf16be_decode)
auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
EXPECT(decoder.validate(test_string));
Vector<u32> processed_code_points;
MUST(decoder.process(test_string, [&](u32 code_point) {
return processed_code_points.try_append(code_point);
}));
EXPECT(processed_code_points.size() == 4);
EXPECT(processed_code_points[0] == 0x73);
EXPECT(processed_code_points[1] == 0xE4);
EXPECT(processed_code_points[2] == 0x6B);
EXPECT(processed_code_points[3] == 0x1F600);
auto utf8 = MUST(decoder.to_utf8(test_string));
EXPECT_EQ(utf8, "säk😀"sv);
}
@ -56,17 +45,6 @@ TEST_CASE(test_utf16le_decode)
auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
EXPECT(decoder.validate(test_string));
Vector<u32> processed_code_points;
MUST(decoder.process(test_string, [&](u32 code_point) {
return processed_code_points.try_append(code_point);
}));
EXPECT(processed_code_points.size() == 4);
EXPECT(processed_code_points[0] == 0x73);
EXPECT(processed_code_points[1] == 0xE4);
EXPECT(processed_code_points[2] == 0x6B);
EXPECT(processed_code_points[3] == 0x1F600);
auto utf8 = MUST(decoder.to_utf8(test_string));
EXPECT_EQ(utf8, "säk😀"sv);
}