/* * Copyright (c) 2025, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include namespace AK { static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*)); Utf16String Utf16String::from_utf8_with_replacement_character(StringView utf8_string, WithBOMHandling with_bom_handling) { if (auto bytes = utf8_string.bytes(); with_bom_handling == WithBOMHandling::Yes && bytes.starts_with({ { 0xEF, 0xBB, 0xBF } })) utf8_string = utf8_string.substring_view(3); Utf8View utf8_view { utf8_string }; if (utf8_view.validate(AllowLonelySurrogates::No)) return Utf16String::from_utf8_without_validation(utf8_string); StringBuilder builder(StringBuilder::Mode::UTF16); for (auto code_point : utf8_view) { if (is_unicode_surrogate(code_point)) builder.append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT); else builder.append_code_point(code_point); } return builder.to_utf16_string_without_validation(); } Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string) { if (utf8_string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf8_string.is_ascii()) { Utf16String string; string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(utf8_string.length()); auto result = utf8_string.bytes().copy_to(string.m_value.short_ascii_string.storage); VERIFY(result == utf8_string.length()); return string; } return Utf16String { Detail::Utf16StringData::from_utf8(utf8_string, Detail::Utf16StringData::AllowASCIIStorage::Yes) }; } Utf16String Utf16String::from_utf16_without_validation(Utf16View const& utf16_string) { if (utf16_string.length_in_code_units() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf16_string.is_ascii()) { Utf16String string; string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(utf16_string.length_in_code_units()); if (utf16_string.has_ascii_storage()) { auto result = utf16_string.bytes().copy_to(string.m_value.short_ascii_string.storage); VERIFY(result == utf16_string.length_in_code_units()); } else { auto result = simdutf::convert_utf16_to_utf8(utf16_string.utf16_span().data(), utf16_string.length_in_code_units(), reinterpret_cast(string.m_value.short_ascii_string.storage)); VERIFY(result == utf16_string.length_in_code_units()); } return string; } return Utf16String { Detail::Utf16StringData::from_utf16(utf16_string) }; } Utf16String Utf16String::from_utf32(Utf32View const& utf32_string) { if (utf32_string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf32_string.is_ascii()) { Utf16String string; string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(utf32_string.length()); auto result = simdutf::convert_utf32_to_utf8(reinterpret_cast(utf32_string.code_points()), utf32_string.length(), reinterpret_cast(string.m_value.short_ascii_string.storage)); VERIFY(result == utf32_string.length()); return string; } return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) }; } Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder) { return Utf16String { Detail::Utf16StringData::from_string_builder(builder) }; } Utf16String Utf16String::repeated(u32 code_point, size_t count) { if (count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && AK::is_ascii(code_point)) { Utf16String string; string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(count); Bytes bytes { string.m_value.short_ascii_string.storage, count }; bytes.fill(static_cast(code_point)); return string; } Array code_units; size_t length_in_code_units = 0; (void)UnicodeUtils::code_point_to_utf16(code_point, [&](auto code_unit) { code_units[length_in_code_units++] = code_unit; }); StringBuilder builder(StringBuilder::Mode::UTF16); builder.append_repeated({ code_units.data(), length_in_code_units }, count); return builder.to_utf16_string(); } Utf16String Utf16String::to_well_formed() const { if (utf16_view().validate(AllowLonelySurrogates::No)) return *this; return Utf16String { Detail::Utf16StringData::to_well_formed(*this) }; } String Utf16String::to_well_formed_utf8() const { if (utf16_view().validate(AllowLonelySurrogates::No)) return to_utf8(AllowLonelySurrogates::No); return to_well_formed().to_utf8(AllowLonelySurrogates::No); } ErrorOr Formatter::format(FormatBuilder& builder, Utf16String const& utf16_string) { if (utf16_string.has_long_utf16_storage()) return builder.builder().try_append(utf16_string.utf16_view()); return builder.put_string(utf16_string.ascii_view()); } }