mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-20 19:45:12 +00:00
AK+LibTextCodec: Stop using Utf16View endianness override
This is preparation for removing the endianness override, since it was only used by a single client: LibTextCodec. While here, add helpers and make use of simdutf for fast conversion.
This commit is contained in:
parent
7029957a6b
commit
e218cd3766
7 changed files with 56 additions and 53 deletions
|
@ -61,6 +61,36 @@ ErrorOr<String> String::from_utf8(StringView view)
|
|||
return result;
|
||||
}
|
||||
|
||||
// Creates a String from a buffer of UTF-16LE encoded bytes.
//
// Returns an error if the input does not validate as UTF-16LE. The number of code units is
// bytes.size() / 2, so a trailing odd byte is neither validated nor converted.
ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
{
    if (!validate_utf16_le(bytes))
        return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
    if (bytes.is_empty())
        return String {};

    auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    auto utf16_length = bytes.size() / 2;

    // Size the output with the little-endian-specific helper. simdutf::utf8_length_from_utf16() interprets
    // the code units in host byte order, which computes the wrong size for this data on a big-endian host
    // and could leave the buffer too small for the conversion below.
    auto required_utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);

    Vector<u8> buffer;
    buffer.resize(required_utf8_length);

    auto utf8_length = simdutf::convert_utf16le_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));

    // The input was validated above, so the converted bytes do not need to be validated again.
    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), utf8_length });
}
|
||||
|
||||
// Creates a String from a buffer of UTF-16BE encoded bytes.
//
// Returns an error if the input does not validate as UTF-16BE. The number of code units is
// bytes.size() / 2, so a trailing odd byte is neither validated nor converted.
ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
{
    if (!validate_utf16_be(bytes))
        return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
    if (bytes.is_empty())
        return String {};

    auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    auto utf16_length = bytes.size() / 2;

    // Size the output with the big-endian-specific helper. simdutf::utf8_length_from_utf16() interprets the
    // code units in host byte order; on a little-endian host a big-endian code unit such as U+4E00 (bytes
    // 4E 00) would be measured as U+004E (1 UTF-8 byte) while the conversion below writes 3 bytes, overflowing
    // the buffer.
    auto required_utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);

    Vector<u8> buffer;
    buffer.resize(required_utf8_length);

    auto utf8_length = simdutf::convert_utf16be_to_utf8(utf16_data, utf16_length, reinterpret_cast<char*>(buffer.data()));

    // The input was validated above, so the converted bytes do not need to be validated again.
    return String::from_utf8_without_validation(ReadonlyBytes { buffer.data(), utf8_length });
}
|
||||
|
||||
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
||||
{
|
||||
if (!utf16.validate())
|
||||
|
|
|
@ -70,6 +70,8 @@ public:
|
|||
|
||||
// Creates a new String from a sequence of UTF-16 encoded code points.
|
||||
static ErrorOr<String> from_utf16(Utf16View const&);
|
||||
static ErrorOr<String> from_utf16_le(ReadonlyBytes);
|
||||
static ErrorOr<String> from_utf16_be(ReadonlyBytes);
|
||||
|
||||
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
|
||||
static ErrorOr<String> from_stream(Stream&, size_t byte_count);
|
||||
|
|
|
@ -421,4 +421,14 @@ size_t Utf16CodePointIterator::length_in_code_units() const
|
|||
return *(*this) < first_supplementary_plane_code_point ? 1 : 2;
|
||||
}
|
||||
|
||||
// Reports whether `bytes`, read as little-endian UTF-16 code units, passes simdutf's UTF-16LE validation.
// Only bytes.size() / 2 whole code units are checked; a trailing odd byte is not looked at.
bool validate_utf16_le(ReadonlyBytes bytes)
{
    auto const* code_units = reinterpret_cast<char16_t const*>(bytes.data());
    auto const code_unit_count = bytes.size() / 2;

    return simdutf::validate_utf16le(code_units, code_unit_count);
}
|
||||
|
||||
// Reports whether `bytes`, read as big-endian UTF-16 code units, passes simdutf's UTF-16BE validation.
// Only bytes.size() / 2 whole code units are checked; a trailing odd byte is not looked at.
bool validate_utf16_be(ReadonlyBytes bytes)
{
    auto const* code_units = reinterpret_cast<char16_t const*>(bytes.data());
    auto const code_unit_count = bytes.size() / 2;

    return simdutf::validate_utf16be(code_units, code_unit_count);
}
|
||||
|
||||
}
|
||||
|
|
|
@ -26,6 +26,9 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host)
|
|||
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
|
||||
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
|
||||
|
||||
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
|
||||
[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
|
||||
|
||||
size_t utf16_code_unit_length_from_utf8(StringView);
|
||||
|
||||
class Utf16View;
|
||||
|
|
|
@ -369,25 +369,9 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
|
|||
return String::from_utf8_with_replacement_character(input);
|
||||
}
|
||||
|
||||
static Utf16View as_utf16(StringView view, AK::Endianness endianness)
|
||||
{
|
||||
return Utf16View {
|
||||
{ reinterpret_cast<u16 const*>(view.bytes().data()), view.length() / 2 },
|
||||
endianness
|
||||
};
|
||||
}
|
||||
|
||||
ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||
{
|
||||
for (auto code_point : as_utf16(input, AK::Endianness::Big))
|
||||
TRY(on_code_point(code_point));
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
bool UTF16BEDecoder::validate(StringView input)
|
||||
{
|
||||
return as_utf16(input, AK::Endianness::Big).validate();
|
||||
return AK::validate_utf16_be(input.bytes());
|
||||
}
|
||||
|
||||
ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
||||
|
@ -396,20 +380,12 @@ ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
|||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
|
||||
input = input.substring_view(2);
|
||||
|
||||
return String::from_utf16(as_utf16(input, AK::Endianness::Big));
|
||||
}
|
||||
|
||||
ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||
{
|
||||
for (auto code_point : as_utf16(input, AK::Endianness::Little))
|
||||
TRY(on_code_point(code_point));
|
||||
|
||||
return {};
|
||||
return String::from_utf16_be(input.bytes());
|
||||
}
|
||||
|
||||
bool UTF16LEDecoder::validate(StringView input)
|
||||
{
|
||||
return as_utf16(input, AK::Endianness::Little).validate();
|
||||
return AK::validate_utf16_le(input.bytes());
|
||||
}
|
||||
|
||||
ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
||||
|
@ -418,7 +394,7 @@ ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
|||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
|
||||
input = input.substring_view(2);
|
||||
|
||||
return String::from_utf16(as_utf16(input, AK::Endianness::Little));
|
||||
return String::from_utf16_le(input.bytes());
|
||||
}
|
||||
|
||||
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||
|
|
|
@ -17,12 +17,12 @@ namespace TextCodec {
|
|||
|
||||
class Decoder {
|
||||
public:
|
||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
|
||||
virtual bool validate(StringView);
|
||||
virtual ErrorOr<String> to_utf8(StringView);
|
||||
|
||||
protected:
|
||||
virtual ~Decoder() = default;
|
||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
|
||||
};
|
||||
|
||||
class UTF8Decoder final : public Decoder {
|
||||
|
@ -34,16 +34,20 @@ public:
|
|||
|
||||
class UTF16BEDecoder final : public Decoder {
|
||||
public:
|
||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
|
||||
virtual bool validate(StringView) override;
|
||||
virtual ErrorOr<String> to_utf8(StringView) override;
|
||||
|
||||
private:
|
||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
|
||||
};
|
||||
|
||||
class UTF16LEDecoder final : public Decoder {
|
||||
public:
|
||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
|
||||
virtual bool validate(StringView) override;
|
||||
virtual ErrorOr<String> to_utf8(StringView) override;
|
||||
|
||||
private:
|
||||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)>) override { VERIFY_NOT_REACHED(); }
|
||||
};
|
||||
|
||||
template<Integral ArrayType = u32>
|
||||
|
|
|
@ -34,17 +34,6 @@ TEST_CASE(test_utf16be_decode)
|
|||
auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
|
||||
|
||||
EXPECT(decoder.validate(test_string));
|
||||
|
||||
Vector<u32> processed_code_points;
|
||||
MUST(decoder.process(test_string, [&](u32 code_point) {
|
||||
return processed_code_points.try_append(code_point);
|
||||
}));
|
||||
EXPECT(processed_code_points.size() == 4);
|
||||
EXPECT(processed_code_points[0] == 0x73);
|
||||
EXPECT(processed_code_points[1] == 0xE4);
|
||||
EXPECT(processed_code_points[2] == 0x6B);
|
||||
EXPECT(processed_code_points[3] == 0x1F600);
|
||||
|
||||
auto utf8 = MUST(decoder.to_utf8(test_string));
|
||||
EXPECT_EQ(utf8, "säk😀"sv);
|
||||
}
|
||||
|
@ -56,17 +45,6 @@ TEST_CASE(test_utf16le_decode)
|
|||
auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
|
||||
|
||||
EXPECT(decoder.validate(test_string));
|
||||
|
||||
Vector<u32> processed_code_points;
|
||||
MUST(decoder.process(test_string, [&](u32 code_point) {
|
||||
return processed_code_points.try_append(code_point);
|
||||
}));
|
||||
EXPECT(processed_code_points.size() == 4);
|
||||
EXPECT(processed_code_points[0] == 0x73);
|
||||
EXPECT(processed_code_points[1] == 0xE4);
|
||||
EXPECT(processed_code_points[2] == 0x6B);
|
||||
EXPECT(processed_code_points[3] == 0x1F600);
|
||||
|
||||
auto utf8 = MUST(decoder.to_utf8(test_string));
|
||||
EXPECT_EQ(utf8, "säk😀"sv);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue