AK: Support non-native endianness in Utf16View

Utf16View currently assumes that its code units are in host endianness. Add
support for explicitly specifying either big or little endianness (which is
mostly just passed through to simdutf). This will allow using simdutf
facilities with LibTextCodec.
This commit is contained in:
Timothy Flynn 2024-07-18 11:57:01 -04:00 committed by Andreas Kling
commit 71c29504af
Notes: sideshowbarker 2024-07-19 06:22:46 +09:00
5 changed files with 228 additions and 79 deletions

View file

@ -7,6 +7,7 @@
#pragma once
#include <AK/ByteString.h>
#include <AK/Endian.h>
#include <AK/Error.h>
#include <AK/Format.h>
#include <AK/Forward.h>
@ -20,10 +21,10 @@ namespace AK {
using Utf16Data = Vector<u16, 1>;
ErrorOr<Utf16Data> utf8_to_utf16(StringView);
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&);
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&);
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32);
ErrorOr<Utf16Data> utf8_to_utf16(StringView, Endianness = Endianness::Host);
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
class Utf16View;
@ -45,14 +46,16 @@ public:
size_t length_in_code_units() const;
private:
Utf16CodePointIterator(u16 const* ptr, size_t length)
Utf16CodePointIterator(u16 const* ptr, size_t length, Endianness endianness)
: m_ptr(ptr)
, m_remaining_code_units(length)
, m_endianness(endianness)
{
}
u16 const* m_ptr { nullptr };
size_t m_remaining_code_units { 0 };
Endianness m_endianness { Endianness::Host };
};
class Utf16View {
@ -66,16 +69,18 @@ public:
Utf16View() = default;
~Utf16View() = default;
explicit Utf16View(ReadonlySpan<u16> code_units)
explicit Utf16View(ReadonlySpan<u16> code_units, Endianness endianness = Endianness::Host)
: m_code_units(code_units)
, m_endianness(endianness)
{
}
template<size_t Size>
Utf16View(char16_t const (&code_units)[Size])
Utf16View(char16_t const (&code_units)[Size], Endianness endianness = Endianness::Host)
: m_code_units(
reinterpret_cast<u16 const*>(&code_units[0]),
code_units[Size - 1] == u'\0' ? Size - 1 : Size)
, m_endianness(endianness)
{
}
@ -94,10 +99,14 @@ public:
size_t length_in_code_units() const { return m_code_units.size(); }
size_t length_in_code_points() const;
Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; }
Utf16CodePointIterator end() const { return { end_ptr(), 0 }; }
Endianness endianness() const { return m_endianness; }
Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size(), m_endianness }; }
Utf16CodePointIterator end() const { return { end_ptr(), 0, m_endianness }; }
u16 const* data() const { return m_code_units.data(); }
char16_t const* char_data() const { return reinterpret_cast<char16_t const*>(data()); }
u16 code_unit_at(size_t index) const;
u32 code_point_at(size_t index) const;
@ -126,6 +135,7 @@ private:
ReadonlySpan<u16> m_code_units;
mutable Optional<size_t> m_length_in_code_points;
Endianness m_endianness { Endianness::Host };
};
}