LibTextCodec+LibURL: Implement utf-8 and euc-jp encoders

Implements the corresponding encoders, selects the appropriate one when
encoding URL search params. If an encoder for the given encoding could
not be found, fallback to utf-8.
This commit is contained in:
BenJilks 2024-08-05 16:03:53 +01:00 committed by Tim Ledbetter
commit 72d0e3284b
Notes: github-actions[bot] 2024-08-08 16:51:38 +00:00
11 changed files with 260 additions and 22 deletions

View file

@ -14,6 +14,8 @@
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
#include <LibTextCodec/Encoder.h>
#include <LibURL/Parser.h>
#include <LibUnicode/IDNA.h>
@ -768,18 +770,17 @@ void Parser::shorten_urls_path(URL& url)
}
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
ErrorOr<String> Parser::percent_encode_after_encoding(TextCodec::Encoder& encoder, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
{
// NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
// 1. Let encodeOutput be an empty I/O queue.
StringBuilder output;
// 3. For each byte of encodeOutput converted to a byte sequence:
for (u8 byte : input) {
TRY(encoder.process(Utf8View(input), [&](u8 byte) -> ErrorOr<void> {
// 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
if (space_as_plus && byte == ' ') {
output.append('+');
continue;
return {};
}
// 2. Let isomorph be a code point whose value is bytes value.
@ -796,7 +797,9 @@ ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentE
else {
output.appendff("%{:02X}", byte);
}
}
return {};
}));
// 6. Return output.
return output.to_string();
@ -851,7 +854,9 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
// 4. Let state be state override if given, or scheme start state otherwise.
State state = state_override.value_or(State::SchemeStart);
// FIXME: 5. Set encoding to the result of getting an output encoding from encoding.
// 5. Set encoding to the result of getting an output encoding from encoding.
auto encoder = TextCodec::encoder_for("utf-8"sv);
VERIFY(encoder.has_value());
// 6. Let buffer be the empty string.
StringBuilder buffer;
@ -1684,7 +1689,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
auto query_percent_encode_set = url->is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query;
// 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to urls query.
url->m_data->query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
url->m_data->query = percent_encode_after_encoding(*encoder, buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
// 3. Set buffer to the empty string.
buffer.clear();
@ -1726,7 +1731,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
// NOTE: The percent-encode is done on EOF on the entire buffer.
buffer.append_code_point(code_point);
} else {
url->m_data->fragment = percent_encode_after_encoding(buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
url->m_data->fragment = percent_encode_after_encoding(*encoder, buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
buffer.clear();
}
break;