ladybird/Libraries/LibWeb/Encoding/TextEncoderStream.cpp
Luke Wilde cae0ee6fa7 LibWeb: Implement TextEncoderStream
Required by the server-side rendering mode of React Router, used by
https://chatgpt.com/

Note that the imported tests do not have the worker variants to prevent
freezing on macOS.
2025-02-07 11:04:46 -05:00

215 lines
8.7 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2025, Luke Wilde <luke@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/UnicodeUtils.h>
#include <LibJS/Runtime/ArrayBuffer.h>
#include <LibJS/Runtime/Realm.h>
#include <LibJS/Runtime/TypedArray.h>
#include <LibWeb/Bindings/ExceptionOrUtils.h>
#include <LibWeb/Bindings/Intrinsics.h>
#include <LibWeb/Bindings/TextEncoderStreamPrototype.h>
#include <LibWeb/Encoding/TextEncoderStream.h>
#include <LibWeb/Streams/AbstractOperations.h>
#include <LibWeb/Streams/TransformStream.h>
#include <LibWeb/WebIDL/Promise.h>
namespace Web::Encoding {
// Invokes the GC_DEFINE_ALLOCATOR macro (defined outside this file) for TextEncoderStream.
// NOTE(review): presumably pairs with a matching allocator declaration in TextEncoderStream.h - confirm.
GC_DEFINE_ALLOCATOR(TextEncoderStream);
// https://encoding.spec.whatwg.org/#dom-textencoderstream
// Creates a TextEncoderStream together with the TransformStream backing it, then sets that
// TransformStream up with the transform and flush algorithms defined by this class.
WebIDL::ExceptionOr<GC::Ref<TextEncoderStream>> TextEncoderStream::construct_impl(JS::Realm& realm)
{
    // 1. Set this's encoder to an instance of the UTF-8 encoder.
    // NOTE: No-op, as AK::String is already in UTF-8 format.

    // NOTE: Steps 4 and 6 are performed first so that the transform can be stored as nonnull in the
    //       GenericTransformStream.
    // 4. Let transformStream be a new TransformStream.
    auto transform_stream = realm.create<Streams::TransformStream>(realm);

    // 6. Set this's transform to transformStream.
    auto stream = realm.create<TextEncoderStream>(realm, transform_stream);

    // 2. Let transformAlgorithm be an algorithm which takes a chunk argument and runs the encode and enqueue a chunk
    //    algorithm with this and chunk.
    auto transform_algorithm = GC::create_function(realm.heap(), [stream](JS::Value chunk) -> GC::Ref<WebIDL::Promise> {
        auto& stream_realm = stream->realm();
        auto& stream_vm = stream_realm.vm();

        auto encode_result = stream->encode_and_enqueue_chunk(chunk);
        if (!encode_result.is_error())
            return WebIDL::create_resolved_promise(stream_realm, JS::js_undefined());

        // Report the failure as a rejected promise carrying the thrown value.
        auto completion = Bindings::exception_to_throw_completion(stream_vm, encode_result.exception());
        return WebIDL::create_rejected_promise(stream_realm, *completion.release_value());
    });

    // 3. Let flushAlgorithm be an algorithm which runs the encode and flush algorithm with this.
    auto flush_algorithm = GC::create_function(realm.heap(), [stream]() -> GC::Ref<WebIDL::Promise> {
        auto& stream_realm = stream->realm();
        auto& stream_vm = stream_realm.vm();

        auto flush_result = stream->encode_and_flush();
        if (!flush_result.is_error())
            return WebIDL::create_resolved_promise(stream_realm, JS::js_undefined());

        // Report the failure as a rejected promise carrying the thrown value.
        auto completion = Bindings::exception_to_throw_completion(stream_vm, flush_result.exception());
        return WebIDL::create_rejected_promise(stream_realm, *completion.release_value());
    });

    // 5. Set up transformStream with transformAlgorithm set to transformAlgorithm and flushAlgorithm set to
    //    flushAlgorithm.
    transform_stream->set_up(transform_algorithm, flush_algorithm);

    return stream;
}
// Forwards the realm to the PlatformObject base and the backing transform stream to
// GenericTransformStreamMixin; the constructor body itself does nothing further.
TextEncoderStream::TextEncoderStream(JS::Realm& realm, GC::Ref<Streams::TransformStream> transform)
    : Bindings::PlatformObject(realm)
    , Streams::GenericTransformStreamMixin(transform)
{
}
// The destructor is defaulted; this class defines no cleanup logic of its own here.
TextEncoderStream::~TextEncoderStream() = default;
// Runs the base class's initialization for the given realm, then applies the
// WEB_SET_PROTOTYPE_FOR_INTERFACE macro for the TextEncoderStream interface.
void TextEncoderStream::initialize(JS::Realm& realm)
{
    Base::initialize(realm);
    WEB_SET_PROTOTYPE_FOR_INTERFACE(TextEncoderStream);
}
// Passes the visitor to both parents: first the Base class, then GenericTransformStreamMixin.
void TextEncoderStream::visit_edges(JS::Cell::Visitor& visitor)
{
    Base::visit_edges(visitor);
    Streams::GenericTransformStreamMixin::visit_edges(visitor);
}
// https://encoding.spec.whatwg.org/#encode-and-enqueue-a-chunk
// Converts `chunk` to a string, encodes its code points to UTF-8 bytes (routing each one through
// convert_code_unit_to_scalar_value) and, if any bytes were produced, enqueues them as a Uint8Array
// on the transform's controller. Errors from the string conversion or the enqueue are propagated.
WebIDL::ExceptionOr<void> TextEncoderStream::encode_and_enqueue_chunk(JS::Value chunk)
{
    // Spec Note: This is equivalent to the "convert a string into a scalar value string" algorithm from the Infra
    //            Standard, but allows for surrogate pairs that are split between strings. [INFRA]
    auto& realm = this->realm();
    auto& vm = this->vm();

    // 1. Let input be the result of converting chunk to a DOMString.
    auto input = TRY(chunk.to_string(vm));

    // 2. Convert input to an I/O queue of code units.
    // Spec Note: DOMString, as well as an I/O queue of code units rather than scalar values, are used here so that a
    //            surrogate pair that is split between chunks can be reassembled into the appropriate scalar value.
    //            The behavior is otherwise identical to USVString. In particular, lone surrogates will be replaced
    //            with U+FFFD.
    auto code_points = input.code_points();
    auto code_point_iterator = code_points.begin();

    // 3. Let output be the I/O queue of bytes << end-of-queue >>.
    ByteBuffer encoded_bytes;

    // 4. While true:
    // NOTE: The end-of-queue check (step 4.2) is the loop condition, so the iterator is never dereferenced once it
    //       points to the end; the end-of-queue handling follows the loop.
    while (!code_point_iterator.done()) {
        // 1. Let item be the result of reading from input.
        auto item = *code_point_iterator;

        // 3. Let result be the result of executing the convert code unit to scalar value algorithm with encoder,
        //    item and input.
        auto scalar_value = convert_code_unit_to_scalar_value(item, code_point_iterator);

        // 4. If result is not continue, then process an item with result, encoder's encoder, input, output, and
        //    "fatal".
        if (!scalar_value.has_value())
            continue;

        (void)AK::UnicodeUtils::code_point_to_utf8(scalar_value.value(), [&encoded_bytes](char utf8_byte) {
            encoded_bytes.append(static_cast<u8>(utf8_byte));
        });
    }

    // 2. If item is end-of-queue, then:
    //     1. Convert output into a byte sequence.
    //        Note: No-op.
    //     2. If output is non-empty, then:
    if (encoded_bytes.is_empty()) {
        // 3. Return.
        return {};
    }

    // 1. Let chunk be a Uint8Array object wrapping an ArrayBuffer containing output.
    auto array_buffer = JS::ArrayBuffer::create(realm, move(encoded_bytes));
    auto array = JS::Uint8Array::create(realm, array_buffer->byte_length(), *array_buffer);

    // 2. Enqueue chunk into encoder's transform.
    TRY(Streams::transform_stream_default_controller_enqueue(*m_transform->controller(), array));

    // 3. Return.
    return {};
}
// https://encoding.spec.whatwg.org/#encode-and-flush
// If a leading surrogate is still pending when the stream is flushed, enqueues a Uint8Array holding
// the UTF-8 encoding of U+FFFD on the transform's controller. Errors from the enqueue are propagated.
WebIDL::ExceptionOr<void> TextEncoderStream::encode_and_flush()
{
    auto& realm = this->realm();

    // 1. If encoder's leading surrogate is non-null, then:
    if (m_leading_surrogate.has_value()) {
        // 1. Let chunk be a Uint8Array object wrapping an ArrayBuffer containing 0xEF 0xBF 0xBD.
        // Spec Note: This is U+FFFD (REPLACEMENT CHARACTER) in UTF-8 bytes.
        constexpr static u8 replacement_character_utf8_bytes[3] = { 0xEF, 0xBF, 0xBD };
        auto bytes = MUST(ByteBuffer::copy(replacement_character_utf8_bytes, sizeof(replacement_character_utf8_bytes)));

        // Move the buffer into the ArrayBuffer rather than copying it; `bytes` is not used afterwards. This matches
        // how encode_and_enqueue_chunk hands its output buffer over.
        auto array_buffer = JS::ArrayBuffer::create(realm, move(bytes));
        auto chunk = JS::Uint8Array::create(realm, array_buffer->byte_length(), *array_buffer);

        // 2. Enqueue chunk into encoder's transform.
        TRY(Streams::transform_stream_default_controller_enqueue(*m_transform->controller(), chunk));
    }

    return {};
}
// https://encoding.spec.whatwg.org/#convert-code-unit-to-scalar-value
// Turns one input item into a scalar value, tracking a pending leading surrogate in
// m_leading_surrogate across calls.
//
// Returns an empty Optional for the spec's "continue" result (the item was a leading surrogate that
// has been stored), otherwise the scalar value to encode. The iterator is advanced past `item` on
// every path except the "restore item to input" path, where it is left in place so that the caller
// reads the same item again.
Optional<u32> TextEncoderStream::convert_code_unit_to_scalar_value(u32 item, Utf8CodePointIterator& code_point_iterator)
{
    // 1. If encoder's leading surrogate is non-null, then:
    if (m_leading_surrogate.has_value()) {
        // 1. Let leadingSurrogate be encoder's leading surrogate.
        auto pending_leading_surrogate = m_leading_surrogate.value();

        // 2. Set encoder's leading surrogate to null.
        m_leading_surrogate.clear();

        // 3. If item is a trailing surrogate, then return a scalar value from surrogates given leadingSurrogate
        //    and item.
        if (Utf16View::is_low_surrogate(item)) {
            // The item is consumed, so step past it.
            ++code_point_iterator;

            // https://encoding.spec.whatwg.org/#scalar-value-from-surrogates
            // To obtain a scalar value from surrogates, given a leading surrogate leading and a trailing surrogate
            // trailing, return 0x10000 + ((leading - 0xD800) << 10) + (trailing - 0xDC00).
            return Utf16View::decode_surrogate_pair(pending_leading_surrogate, item);
        }

        // 4. Restore item to input.
        // NOTE: Done by not advancing the iterator on this path.

        // 5. Return U+FFFD.
        return 0xFFFD;
    }

    // Every remaining path consumes the item.
    ++code_point_iterator;

    // 2. If item is a leading surrogate, then set encoder's leading surrogate to item and return continue.
    if (Utf16View::is_high_surrogate(item)) {
        m_leading_surrogate = item;
        return OptionalNone {};
    }

    // 3. If item is a trailing surrogate, then return U+FFFD.
    if (Utf16View::is_low_surrogate(item))
        return 0xFFFD;

    // 4. Return item.
    return item;
}
}