AK+LibIPC: Implement an encoder/decoder for UTF-16 strings

This commit is contained in:
Timothy Flynn 2025-07-28 16:11:50 -04:00 committed by Tim Flynn
commit 13ed6aba71
Notes: github-actions[bot] 2025-08-02 17:11:30 +00:00
10 changed files with 183 additions and 11 deletions

View file

@ -4,6 +4,7 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Stream.h>
#include <AK/Utf16String.h>
#include <AK/Utf32View.h>
@ -85,6 +86,24 @@ Utf16String Utf16String::from_utf32(Utf32View const& utf32_string)
return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) };
}
// Constructs a Utf16String from a raw payload read off `stream`.
//
// `length_in_code_units` is the number of code units to read and `is_ascii` states whether the
// payload is one byte per code unit (ASCII) or UTF-16. ASCII payloads that fit within
// Detail::MAX_SHORT_STRING_BYTE_COUNT are read straight into the inline short-string storage;
// every other payload is delegated to Detail::Utf16StringData::from_ipc_stream().
//
// Returns an error if the stream cannot fill the requested bytes or the bytes fail validation.
ErrorOr<Utf16String> Utf16String::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii)
{
    auto const fits_in_short_storage = is_ascii && length_in_code_units <= Detail::MAX_SHORT_STRING_BYTE_COUNT;

    // Anything that cannot live inline needs heap-backed string data.
    if (!fits_in_short_storage)
        return Utf16String { TRY(Detail::Utf16StringData::from_ipc_stream(stream, length_in_code_units, is_ascii)) };

    Utf16String result;
    result.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(length_in_code_units);

    // Read directly into the inline storage to avoid an intermediate copy.
    Bytes inline_storage { result.m_value.short_ascii_string.storage, length_in_code_units };
    TRY(stream.read_until_filled(inline_storage));

    // The sender's ASCII claim is not trusted; check the bytes that actually arrived.
    if (StringView { inline_storage }.is_ascii())
        return result;

    return Error::from_string_literal("Stream contains invalid ASCII data");
}
Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder)
{
return Utf16String { Detail::Utf16StringData::from_string_builder(builder) };

View file

@ -138,6 +138,8 @@ public:
return from_string_builder_without_validation(builder);
}
// Reads `length_in_code_units` code units from the stream and builds a string from them.
// `is_ascii` selects the payload layout: one byte per code unit when true, UTF-16 code units otherwise.
// Fails if the stream cannot supply the requested data or the data does not validate.
static ErrorOr<Utf16String> from_ipc_stream(Stream&, size_t length_in_code_units, bool is_ascii);
Utf16String to_well_formed() const;
String to_well_formed_utf8() const;

View file

@ -4,6 +4,7 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Checked.h>
#include <AK/Stream.h>
#include <AK/TypedTransfer.h>
#include <AK/Utf16StringData.h>
#include <AK/Utf32View.h>
@ -158,6 +159,31 @@ NonnullRefPtr<Utf16StringData> Utf16StringData::from_string_builder(StringBuilde
return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
}
// Creates string data by reading a raw payload of `length_in_code_units` code units from `stream`.
//
// When `is_ascii` is true the payload is one byte per code unit and is stored with ASCII storage;
// otherwise it is `length_in_code_units` char16_t values stored with UTF-16 storage. The payload is
// read directly into the freshly created string and validated afterwards.
//
// Returns an error if the UTF-16 byte count would overflow size_t, if the stream cannot fill the
// requested bytes, or if the bytes are not valid ASCII / UTF-16.
ErrorOr<NonnullRefPtr<Utf16StringData>> Utf16StringData::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii)
{
    RefPtr<Utf16StringData> string;

    if (is_ascii) {
        string = create_uninitialized(StorageType::ASCII, length_in_code_units);

        Bytes bytes { string->m_ascii_data, length_in_code_units };
        TRY(stream.read_until_filled(bytes));

        // The sender's ASCII claim is not trusted; verify the bytes that were actually received.
        if (!string->ascii_view().is_ascii())
            return Error::from_string_literal("Stream contains invalid ASCII data");
    } else {
        // The length is supplied by the peer. Reject values whose byte count would wrap around,
        // since a wrapped count would describe a buffer shorter than the string claims to be.
        Checked<size_t> length_in_bytes = length_in_code_units;
        length_in_bytes *= sizeof(char16_t);
        if (length_in_bytes.has_overflow())
            return Error::from_string_literal("Stream contains a UTF-16 string length that is too large");

        string = create_uninitialized(StorageType::UTF16, length_in_code_units);

        Bytes bytes { reinterpret_cast<u8*>(string->m_utf16_data), length_in_bytes.value() };
        TRY(stream.read_until_filled(bytes));

        if (!string->utf16_view().validate())
            return Error::from_string_literal("Stream contains invalid UTF-16 data");
    }

    return string.release_nonnull();
}
NonnullRefPtr<Utf16StringData> Utf16StringData::to_well_formed(Utf16View const& utf16_string)
{
VERIFY(!utf16_string.has_ascii_storage());

View file

@ -34,6 +34,7 @@ public:
static NonnullRefPtr<Utf16StringData> from_utf16(Utf16View const&);
static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&);
static NonnullRefPtr<Utf16StringData> from_string_builder(StringBuilder&);
// Reads `length_in_code_units` code units from the stream into newly created string data.
// `is_ascii` selects ASCII storage (one byte per code unit) or UTF-16 storage.
// Fails if the stream cannot supply the requested data or the data does not validate.
static ErrorOr<NonnullRefPtr<Utf16StringData>> from_ipc_stream(Stream&, size_t length_in_code_units, bool is_ascii);
static NonnullRefPtr<Utf16StringData> to_well_formed(Utf16View const&);

View file

@ -1,12 +1,13 @@
/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2023-2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/JsonValue.h>
#include <AK/NumericLimits.h>
#include <AK/Utf16String.h>
#include <LibCore/AnonymousBuffer.h>
#include <LibCore/DateTime.h>
#include <LibCore/Proxy.h>
@ -30,6 +31,15 @@ ErrorOr<String> decode(Decoder& decoder)
return String::from_stream(decoder.stream(), length);
}
// Decodes a Utf16String. The fields are consumed in this order: a bool telling whether the
// payload is ASCII, the length in code units, and then the payload itself, which
// Utf16String::from_ipc_stream() reads from the decoder's stream.
template<>
ErrorOr<Utf16String> decode(Decoder& decoder)
{
    auto const has_ascii_payload = TRY(decoder.decode<bool>());
    auto const code_unit_count = TRY(decoder.decode_size());

    auto& payload_stream = decoder.stream();
    return Utf16String::from_ipc_stream(payload_stream, code_unit_count, has_ascii_payload);
}
template<>
ErrorOr<ByteString> decode(Decoder& decoder)
{

View file

@ -1,6 +1,6 @@
/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2023-2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -88,6 +88,9 @@ ErrorOr<T> decode(Decoder& decoder)
template<>
ErrorOr<String> decode(Decoder&);
// Utf16String is decoded from an ASCII flag, a length in code units, and the raw payload.
template<>
ErrorOr<Utf16String> decode(Decoder&);
template<>
ErrorOr<ByteString> decode(Decoder&);

View file

@ -1,7 +1,7 @@
/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2021, kleines Filmröllchen <filmroellchen@serenityos.org>
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2023-2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -14,6 +14,8 @@
#include <AK/NumericLimits.h>
#include <AK/String.h>
#include <AK/Time.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h>
#include <LibCore/AnonymousBuffer.h>
#include <LibCore/DateTime.h>
#include <LibCore/Proxy.h>
@ -58,6 +60,26 @@ ErrorOr<void> encode(Encoder& encoder, StringView const& value)
return {};
}
// Encodes a Utf16String by encoding a view over its contents.
template<>
ErrorOr<void> encode(Encoder& encoder, Utf16String const& value)
{
    auto view = value.utf16_view();
    return encoder.encode(view);
}
// Encodes a Utf16View as three fields: a bool telling whether the view has ASCII storage, the
// length in code units, and the raw payload. The payload is one byte per code unit for ASCII
// storage and sizeof(char16_t) bytes per code unit otherwise.
template<>
ErrorOr<void> encode(Encoder& encoder, Utf16View const& value)
{
    auto const has_ascii_storage = value.has_ascii_storage();
    auto const code_unit_count = value.length_in_code_units();

    TRY(encoder.encode(has_ascii_storage));
    TRY(encoder.encode_size(code_unit_count));

    if (!has_ascii_storage) {
        auto const* code_units = reinterpret_cast<u8 const*>(value.utf16_span().data());
        TRY(encoder.append(code_units, code_unit_count * sizeof(char16_t)));
        return {};
    }

    TRY(encoder.append(value.bytes().data(), code_unit_count));
    return {};
}
template<>
ErrorOr<void> encode(Encoder& encoder, ByteString const& value)
{

View file

@ -1,6 +1,6 @@
/*
* Copyright (c) 2018-2021, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2023-2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -87,6 +87,12 @@ ErrorOr<void> encode(Encoder&, String const&);
template<>
ErrorOr<void> encode(Encoder&, StringView const&);
// Utf16String is encoded through its Utf16View.
template<>
ErrorOr<void> encode(Encoder&, Utf16String const&);
// Utf16View is encoded as an ASCII-storage flag, a length in code units, and the raw payload.
template<>
ErrorOr<void> encode(Encoder&, Utf16View const&);
template<>
ErrorOr<void> encode(Encoder&, ByteString const&);

View file

@ -194,6 +194,8 @@ Vector<Endpoint> parse(ByteBuffer const& file_contents)
parameter.type_for_encoding = parameter.type.replace("Vector"sv, "ReadonlySpan"sv, ReplaceMode::FirstOnly);
} else if (parameter.type.is_one_of("String"sv, "ByteString"sv)) {
parameter.type_for_encoding = "StringView"sv;
} else if (parameter.type == "Utf16String"sv) {
parameter.type_for_encoding = "Utf16View"sv;
} else if (parameter.type == "ByteBuffer"sv) {
parameter.type_for_encoding = "ReadonlyBytes"sv;
} else {
@ -518,7 +520,7 @@ private:)~~~");
message_generator.appendln("\n};");
}
void generate_proxy_method(SourceGenerator& message_generator, Endpoint const& endpoint, Message const& message, ByteString const& name, Vector<Parameter> const& parameters, bool is_synchronous, bool is_try, bool is_utf8_string_overload = false)
void generate_proxy_method(SourceGenerator& message_generator, Endpoint const& endpoint, Message const& message, ByteString const& name, Vector<Parameter> const& parameters, bool is_synchronous, bool is_try, bool is_unicode_string_overload = false)
{
// FIXME: For String parameters, we want to retain the property that all transferred String objects are strictly UTF-8.
// So instead of generating a single proxy method that accepts StringView parameters, we generate two overloads.
@ -527,7 +529,7 @@ void generate_proxy_method(SourceGenerator& message_generator, Endpoint const& e
//
// Ideally, we will eventually have separate StringView types for each of String and ByteString, where String's
// view internally provides UTF-8 guarantees. Then we won't need these overloads.
bool generate_utf8_string_overload = false;
bool generate_unicode_string_overload = false;
ByteString return_type = "void";
if (is_synchronous) {
@ -554,7 +556,7 @@ void generate_proxy_method(SourceGenerator& message_generator, Endpoint const& e
ByteString type;
if (is_synchronous || is_try)
type = parameter.type;
else if (is_utf8_string_overload)
else if (is_unicode_string_overload)
type = make_argument_type(parameter.type);
else
type = make_argument_type(parameter.type_for_encoding);
@ -569,7 +571,7 @@ void generate_proxy_method(SourceGenerator& message_generator, Endpoint const& e
message_generator.append(") {");
if (!is_synchronous && !is_try && !is_utf8_string_overload) {
if (!is_synchronous && !is_try && !is_unicode_string_overload) {
for (auto const& parameter : parameters) {
auto const& type = is_synchronous || is_try ? parameter.type : parameter.type_for_encoding;
@ -579,7 +581,14 @@ void generate_proxy_method(SourceGenerator& message_generator, Endpoint const& e
argument_generator.append(R"~~~(
VERIFY(Utf8View { @argument.name@ }.validate());)~~~");
generate_utf8_string_overload = true;
generate_unicode_string_overload = true;
} else if (parameter.type == "Utf16String"sv && type == "Utf16View"sv) {
auto argument_generator = message_generator.fork();
argument_generator.set("argument.name", parameter.name);
argument_generator.append(R"~~~(
VERIFY(@argument.name@.validate());)~~~");
generate_unicode_string_overload = true;
}
}
}
@ -655,8 +664,8 @@ void generate_proxy_method(SourceGenerator& message_generator, Endpoint const& e
message_generator.appendln(R"~~~(
})~~~");
if (generate_utf8_string_overload)
generate_proxy_method(message_generator, endpoint, message, message.name, message.inputs, is_synchronous, is_try, generate_utf8_string_overload);
if (generate_unicode_string_overload)
generate_proxy_method(message_generator, endpoint, message, message.name, message.inputs, is_synchronous, is_try, generate_unicode_string_overload);
}
void do_message_for_proxy(SourceGenerator message_generator, Endpoint const& endpoint, Message const& message)

View file

@ -9,6 +9,7 @@
#include <AK/Array.h>
#include <AK/CharacterTypes.h>
#include <AK/Enumerate.h>
#include <AK/MemoryStream.h>
#include <AK/StringBuilder.h>
#include <AK/Utf16String.h>
#include <AK/Utf32View.h>
@ -411,6 +412,79 @@ TEST_CASE(repeated)
EXPECT_DEATH("Creating a string from an invalid code point", (void)Utf16String::repeated(0xffffffff, 1));
}
TEST_CASE(from_ipc_stream)
{
    // Serializes UTF-16 text into the raw bytes that from_ipc_stream() reads for non-ASCII payloads.
    auto utf16_payload = [](auto const& text) {
        StringBuilder builder(StringBuilder::Mode::UTF16);
        builder.append(text);
        return MUST(builder.to_byte_buffer());
    };

    // ASCII payload small enough for short (inline) storage.
    {
        auto expected = "abc"sv;
        FixedMemoryStream input { expected.bytes() };

        auto decoded = TRY_OR_FAIL(Utf16String::from_ipc_stream(input, expected.length(), true));
        EXPECT(decoded.is_ascii());
        EXPECT(!decoded.has_long_ascii_storage());
        EXPECT(decoded.has_short_ascii_storage());
        EXPECT_EQ(decoded.length_in_code_units(), 3uz);
        EXPECT_EQ(decoded, expected);
    }

    // ASCII payload that ends up in long ASCII storage.
    {
        auto expected = "abcdefghijklmnopqrstuvwxyz"sv;
        FixedMemoryStream input { expected.bytes() };

        auto decoded = TRY_OR_FAIL(Utf16String::from_ipc_stream(input, expected.length(), true));
        EXPECT(decoded.is_ascii());
        EXPECT(decoded.has_long_ascii_storage());
        EXPECT(!decoded.has_short_ascii_storage());
        EXPECT_EQ(decoded.length_in_code_units(), 26uz);
        EXPECT_EQ(decoded, expected);
    }

    // UTF-16 payload containing a surrogate pair.
    {
        auto expected = u"hello 😀 there!"sv;
        auto payload = utf16_payload(expected);
        FixedMemoryStream input { payload.bytes() };

        auto decoded = TRY_OR_FAIL(Utf16String::from_ipc_stream(input, expected.length_in_code_units(), false));
        EXPECT(!decoded.is_ascii());
        EXPECT(!decoded.has_long_ascii_storage());
        EXPECT(!decoded.has_short_ascii_storage());
        EXPECT_EQ(decoded.length_in_code_units(), 15uz);
        EXPECT_EQ(decoded, expected);
    }

    // Requesting more code units than the stream holds must fail.
    {
        auto source = "abc"sv;
        FixedMemoryStream input { source.bytes() };

        EXPECT(Utf16String::from_ipc_stream(input, source.length() + 1, true).is_error());
    }

    // UTF-16 bytes presented as a short ASCII payload must fail.
    {
        auto source = u"😀"sv;
        auto payload = utf16_payload(source);
        FixedMemoryStream input { payload.bytes() };

        EXPECT(Utf16String::from_ipc_stream(input, source.length_in_code_units(), true).is_error());
    }

    // UTF-16 bytes presented as a longer ASCII payload must fail.
    {
        auto source = u"hello 😀 there!"sv;
        auto payload = utf16_payload(source);
        FixedMemoryStream input { payload.bytes() };

        EXPECT(Utf16String::from_ipc_stream(input, source.length_in_code_units(), true).is_error());
    }
}
TEST_CASE(to_lowercase_unconditional_special_casing)
{
// LATIN SMALL LETTER SHARP S