/* * Copyright (c) 2021-2025, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include TEST_CASE(decode_ascii) { auto string = MUST(AK::utf8_to_utf16("Hello World!11"sv)); Utf16View view { string }; size_t valid_code_units = 0; EXPECT(view.validate(valid_code_units)); EXPECT_EQ(valid_code_units, view.length_in_code_units()); auto expected = Array { (u32)72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 }; EXPECT_EQ(expected.size(), view.length_in_code_points()); size_t i = 0; for (u32 code_point : view) { EXPECT_EQ(code_point, expected[i++]); } EXPECT_EQ(i, expected.size()); } TEST_CASE(decode_utf8) { auto string = MUST(AK::utf8_to_utf16("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv)); Utf16View view { string }; size_t valid_code_units = 0; EXPECT(view.validate(valid_code_units)); EXPECT_EQ(valid_code_units, view.length_in_code_units()); auto expected = Array { (u32)1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 }; EXPECT_EQ(expected.size(), view.length_in_code_points()); size_t i = 0; for (u32 code_point : view) { EXPECT_EQ(code_point, expected[i++]); } EXPECT_EQ(i, expected.size()); } TEST_CASE(encode_utf8) { { auto utf8_string = "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"_string; auto string = MUST(AK::utf8_to_utf16(utf8_string)); Utf16View view { string }; EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), utf8_string); EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::No)), utf8_string); } { Utf16View view { u"\xd83d"sv }; EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), "\xed\xa0\xbd"sv); EXPECT(view.to_utf8(AllowLonelySurrogates::No).is_error()); } } TEST_CASE(decode_utf16) { Utf16View view { u"Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv }; EXPECT_EQ(view.length_in_code_units(), 39uz); size_t valid_code_units = 0; EXPECT(view.validate(valid_code_units)); EXPECT_EQ(valid_code_units, view.length_in_code_units()); auto expected = Array { (u32)1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 }; EXPECT_EQ(expected.size(), view.length_in_code_points()); size_t i = 0; for (u32 code_point : view) { EXPECT_EQ(code_point, expected[i++]); } EXPECT_EQ(i, expected.size()); } TEST_CASE(utf16_code_unit_length_from_utf8) { EXPECT_EQ(AK::utf16_code_unit_length_from_utf8(""sv), 0uz); EXPECT_EQ(AK::utf16_code_unit_length_from_utf8("abc"sv), 3uz); EXPECT_EQ(AK::utf16_code_unit_length_from_utf8("😀"sv), 2uz); EXPECT_EQ(AK::utf16_code_unit_length_from_utf8("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv), 39uz); } TEST_CASE(null_view) { Utf16View view; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 0zu); EXPECT_EQ(view.length_in_code_points(), 0zu); EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::No)), ""sv); EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), ""sv); for ([[maybe_unused]] auto it : view) FAIL("Iterating a null UTF-16 string should not produce any values"); } TEST_CASE(utf16_literal) { { Utf16View view { u""sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 0u); } { Utf16View view { u"a"sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 1u); EXPECT_EQ(view.code_unit_at(0), 0x61u); } { Utf16View view { u"abc"sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 3u); EXPECT_EQ(view.code_unit_at(0), 0x61u); EXPECT_EQ(view.code_unit_at(1), 0x62u); EXPECT_EQ(view.code_unit_at(2), 0x63u); } { Utf16View view { u"🙃"sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 2u); EXPECT_EQ(view.code_unit_at(0), 0xd83du); EXPECT_EQ(view.code_unit_at(1), 0xde43u); } } TEST_CASE(iterate_utf16) { auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv)); Utf16View view { string }; auto iterator = view.begin(); EXPECT(*iterator == 1055); EXPECT(iterator.length_in_code_units() == 1); EXPECT(++iterator != view.end()); EXPECT(*iterator == 1088); EXPECT(iterator.length_in_code_units() == 1); EXPECT(++iterator != view.end()); EXPECT(*iterator == 1080); EXPECT(iterator.length_in_code_units() == 1); EXPECT(++iterator != view.end()); EXPECT(*iterator == 1074); EXPECT(iterator.length_in_code_units() == 1); EXPECT(++iterator != view.end()); EXPECT(*iterator == 1077); EXPECT(iterator.length_in_code_units() == 1); EXPECT(++iterator != view.end()); EXPECT(*iterator == 1090); EXPECT(iterator.length_in_code_units() == 1); EXPECT(++iterator != view.end()); EXPECT(*iterator == 32); EXPECT(iterator.length_in_code_units() == 1); EXPECT(++iterator != view.end()); EXPECT(*iterator == 128512); EXPECT(iterator.length_in_code_units() == 2); EXPECT(++iterator == view.end()); EXPECT_DEATH("Dereferencing Utf16CodePointIterator which is at its end.", *iterator); EXPECT_DEATH("Incrementing Utf16CodePointIterator which is at its end.", ++iterator); } TEST_CASE(validate_invalid_utf16) { size_t valid_code_units = 0; Utf16View invalid; { // Lonely high surrogate. invalid = u"\xd800"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 1uz); invalid = u"\xdbff"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 1uz); } { // Lonely low surrogate. invalid = u"\xdc00"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 1uz); invalid = u"\xdfff"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 1uz); } { // High surrogate followed by non-surrogate. invalid = u"\xd800\x0000"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 2uz); invalid = u"\xd800\xe000"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 2uz); } { // High surrogate followed by high surrogate. invalid = u"\xd800\xd800"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 2uz); invalid = u"\xd800\xdbff"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 2uz); } { // Valid UTF-16 followed by invalid code units. invalid = u"\x0041\x0041\xd800"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 2uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 3uz); invalid = u"\x0041\x0041\xd800"sv; EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No)); EXPECT_EQ(valid_code_units, 2uz); EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes)); EXPECT_EQ(valid_code_units, 3uz); } } TEST_CASE(decode_invalid_utf16) { { // Lonely high surrogate. Utf16View view { u"AB\xd800"sv }; EXPECT_EQ(view.length_in_code_units(), 3uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd }; EXPECT_EQ(expected.size(), view.length_in_code_points()); size_t i = 0; for (u32 code_point : view) { EXPECT_EQ(code_point, expected[i++]); } EXPECT_EQ(i, expected.size()); } { // Lonely low surrogate. Utf16View view { u"AB\xdc00"sv }; EXPECT_EQ(view.length_in_code_units(), 3uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd }; EXPECT_EQ(expected.size(), view.length_in_code_points()); size_t i = 0; for (u32 code_point : view) { EXPECT_EQ(code_point, expected[i++]); } EXPECT_EQ(i, expected.size()); } { // High surrogate followed by non-surrogate. Utf16View view { u"AB\xd800\x0000"sv }; EXPECT_EQ(view.length_in_code_units(), 4uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0 }; EXPECT_EQ(expected.size(), view.length_in_code_points()); size_t i = 0; for (u32 code_point : view) { EXPECT_EQ(code_point, expected[i++]); } EXPECT_EQ(i, expected.size()); } { // High surrogate followed by high surrogate. Utf16View view { u"AB\xd800\xd800"sv }; EXPECT_EQ(view.length_in_code_units(), 4uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0xfffd }; EXPECT_EQ(expected.size(), view.length_in_code_points()); size_t i = 0; for (u32 code_point : view) { EXPECT_EQ(code_point, expected[i++]); } EXPECT_EQ(i, expected.size()); } } TEST_CASE(is_ascii) { EXPECT(Utf16View {}.is_ascii()); EXPECT(u"a"sv.is_ascii()); EXPECT(u"foo"sv.is_ascii()); EXPECT(u"foo\t\n\rbar\v\b123"sv.is_ascii()); EXPECT(u"The quick (\"brown\") fox can't jump 32.3 feet, right?"sv.is_ascii()); EXPECT(!u"😀"sv.is_ascii()); EXPECT(!u"foo 😀"sv.is_ascii()); EXPECT(!u"😀 foo"sv.is_ascii()); EXPECT(!u"The quick (“brown”) fox can’t jump 32.3 feet, right?"sv.is_ascii()); } TEST_CASE(to_ascii_lowercase) { EXPECT_EQ(u""sv.to_ascii_lowercase(), u""sv); EXPECT_EQ(u"foobar"sv.to_ascii_lowercase(), u"foobar"sv); EXPECT_EQ(u"FooBar"sv.to_ascii_lowercase(), u"foobar"sv); EXPECT_EQ(u"FOOBAR"sv.to_ascii_lowercase(), u"foobar"sv); EXPECT_EQ(u"FOO 😀 BAR"sv.to_ascii_lowercase(), u"foo 😀 bar"sv); } TEST_CASE(to_ascii_uppercase) { EXPECT_EQ(u""sv.to_ascii_uppercase(), u""sv); EXPECT_EQ(u"foobar"sv.to_ascii_uppercase(), u"FOOBAR"sv); EXPECT_EQ(u"FooBar"sv.to_ascii_uppercase(), u"FOOBAR"sv); EXPECT_EQ(u"FOOBAR"sv.to_ascii_uppercase(), u"FOOBAR"sv); EXPECT_EQ(u"foo 😀 bar"sv.to_ascii_uppercase(), u"FOO 😀 BAR"sv); } TEST_CASE(to_ascii_titlecase) { EXPECT_EQ(u""sv.to_ascii_titlecase(), u""sv); EXPECT_EQ(u"foobar"sv.to_ascii_titlecase(), u"Foobar"sv); EXPECT_EQ(u"FooBar"sv.to_ascii_titlecase(), u"Foobar"sv); EXPECT_EQ(u"foo bar"sv.to_ascii_titlecase(), u"Foo Bar"sv); EXPECT_EQ(u"FOO BAR"sv.to_ascii_titlecase(), u"Foo Bar"sv); EXPECT_EQ(u"foo 😀 bar"sv.to_ascii_titlecase(), u"Foo 😀 Bar"sv); } TEST_CASE(equals_ignoring_case) { auto string1 = MUST(AK::utf8_to_utf16("foobar"sv)); auto string2 = MUST(AK::utf8_to_utf16("FooBar"sv)); EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 })); string1 = MUST(AK::utf8_to_utf16(""sv)); string2 = MUST(AK::utf8_to_utf16(""sv)); EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 })); string1 = MUST(AK::utf8_to_utf16(""sv)); string2 = MUST(AK::utf8_to_utf16("FooBar"sv)); EXPECT(!Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 })); } TEST_CASE(replace) { auto result = u""sv.replace({}, {}, ReplaceMode::FirstOnly); EXPECT_EQ(result, u""sv); result = u""sv.replace(u"foo"sv, u"bar"sv, ReplaceMode::FirstOnly); EXPECT_EQ(result, u""sv); result = u"foo"sv.replace(u"bar"sv, u"baz"sv, ReplaceMode::FirstOnly); EXPECT_EQ(result, u"foo"sv); result = u"foo"sv.replace(u"foo"sv, u"bar"sv, ReplaceMode::FirstOnly); EXPECT_EQ(result, u"bar"sv); result = u"foo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::FirstOnly); EXPECT_EQ(result, u"feo"sv); result = u"foo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All); EXPECT_EQ(result, u"fee"sv); result = u"foo boo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::FirstOnly); EXPECT_EQ(result, u"feo boo"sv); result = u"foo boo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All); EXPECT_EQ(result, u"fee bee"sv); result = u"foo 😀 boo 😀"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All); EXPECT_EQ(result, u"fee 😀 bee 😀"sv); result = u"foo 😀 boo 😀"sv.replace(u"😀"sv, u"🙃"sv, ReplaceMode::FirstOnly); EXPECT_EQ(result, u"foo 🙃 boo 😀"sv); result = u"foo 😀 boo 😀"sv.replace(u"😀"sv, u"🙃"sv, ReplaceMode::All); EXPECT_EQ(result, u"foo 🙃 boo 🙃"sv); result = u"foo 😀 boo 😀"sv.replace(u"😀 "sv, u"🙃 "sv, ReplaceMode::All); EXPECT_EQ(result, u"foo 🙃 boo 😀"sv); } TEST_CASE(substring_view) { auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv)); { Utf16View view { string }; view = view.substring_view(7, 2); EXPECT(view.length_in_code_units() == 2); EXPECT_EQ(MUST(view.to_utf8()), "😀"sv); } { Utf16View view { string }; view = view.substring_view(7, 1); EXPECT(view.length_in_code_units() == 1); EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), "\xed\xa0\xbd"sv); EXPECT(view.to_utf8(AllowLonelySurrogates::No).is_error()); } } TEST_CASE(trim) { Utf16View whitespace { u" "sv }; { Utf16View view { u"word"sv }; EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word"sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u"word"sv); } { Utf16View view { u" word"sv }; EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word"sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" word"sv); } { Utf16View view { u"word "sv }; EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word "sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u"word"sv); } { Utf16View view { u" word "sv }; EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word "sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" word"sv); } { Utf16View view { u" \u180E "sv }; EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"\u180E"sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"\u180E "sv); EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" \u180E"sv); } { Utf16View view { u"😀wfh😀"sv }; EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Both), u"wfh"sv); EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Left), u"wfh😀"sv); EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Right), u"😀wfh"sv); } } TEST_CASE(contains) { EXPECT(!u""sv.contains(u'a')); EXPECT(u"a"sv.contains(u'a')); EXPECT(!u"b"sv.contains(u'a')); EXPECT(u"ab"sv.contains(u'a')); EXPECT(u"😀"sv.contains(u'\xd83d')); EXPECT(u"😀"sv.contains(u'\xde00')); EXPECT(u""sv.contains(u""sv)); EXPECT(!u""sv.contains(u"a"sv)); EXPECT(u"a"sv.contains(u"a"sv)); EXPECT(!u"b"sv.contains(u"a"sv)); EXPECT(u"ab"sv.contains(u"a"sv)); EXPECT(u"😀"sv.contains(u"\xd83d"sv)); EXPECT(u"😀"sv.contains(u"\xde00"sv)); EXPECT(u"😀"sv.contains(u"😀"sv)); EXPECT(u"ab😀"sv.contains(u"😀"sv)); } TEST_CASE(starts_with) { EXPECT(Utf16View {}.starts_with(u""sv)); EXPECT(!Utf16View {}.starts_with(u" "sv)); EXPECT(u"a"sv.starts_with(u""sv)); EXPECT(u"a"sv.starts_with(u"a"sv)); EXPECT(!u"a"sv.starts_with(u"b"sv)); EXPECT(!u"a"sv.starts_with(u"ab"sv)); EXPECT(u"abc"sv.starts_with(u""sv)); EXPECT(u"abc"sv.starts_with(u"a"sv)); EXPECT(u"abc"sv.starts_with(u"ab"sv)); EXPECT(u"abc"sv.starts_with(u"abc"sv)); EXPECT(!u"abc"sv.starts_with(u"b"sv)); EXPECT(!u"abc"sv.starts_with(u"bc"sv)); auto emoji = u"😀🙃"sv; EXPECT(emoji.starts_with(u""sv)); EXPECT(emoji.starts_with(u"😀"sv)); EXPECT(emoji.starts_with(u"😀🙃"sv)); EXPECT(!emoji.starts_with(u"a"sv)); EXPECT(!emoji.starts_with(u"🙃"sv)); } TEST_CASE(find_code_unit_offset) { auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv)); Utf16View const view { conversion_result }; EXPECT_EQ(0u, view.find_code_unit_offset(u""sv).value()); EXPECT_EQ(4u, view.find_code_unit_offset(u""sv, 4).value()); EXPECT(!view.find_code_unit_offset(u""sv, 16).has_value()); EXPECT_EQ(0u, view.find_code_unit_offset(u"😀"sv).value()); EXPECT_EQ(5u, view.find_code_unit_offset(u"😀"sv, 1).value()); EXPECT_EQ(2u, view.find_code_unit_offset(u"foo"sv).value()); EXPECT_EQ(7u, view.find_code_unit_offset(u"bar"sv).value()); EXPECT(!view.find_code_unit_offset(u"baz"sv).has_value()); } TEST_CASE(find_code_unit_offset_ignoring_case) { auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv)); Utf16View const view { conversion_result }; EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u""sv).value()); EXPECT_EQ(4u, view.find_code_unit_offset_ignoring_case(u""sv, 4).value()); EXPECT(!view.find_code_unit_offset_ignoring_case(u""sv, 16).has_value()); EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"😀"sv).value()); EXPECT_EQ(5u, view.find_code_unit_offset_ignoring_case(u"😀"sv, 1).value()); EXPECT_EQ(2u, view.find_code_unit_offset_ignoring_case(u"foO"sv).value()); EXPECT_EQ(7u, view.find_code_unit_offset_ignoring_case(u"baR"sv).value()); EXPECT(!view.find_code_unit_offset_ignoring_case(u"baz"sv).has_value()); }