LibWeb: Skip CSS tokenizer filtering when string has no '\r' or '\f'
When loading a canned version of reddit.com, we end up parsing many, many shadow tree style sheets of roughly 170 KiB of text each. None of them contain '\r' or '\f', yet we spend 2-3 ms per sheet just looping over and reconstructing the text to see whether any newlines need normalizing.

This patch makes the common case faster in two ways:

- We use TextCodec::Decoder::to_utf8() instead of process(). This gives us a one-shot fast validation and conversion to UTF-8 instead of going through the generic code-point-at-a-time callback API.

- We scan for '\r' and '\f' before filtering, and if neither is present, we simply use the unfiltered string.

With these changes, we now spend 0 ms in the filtering function for the vast majority of style sheets I've seen so far.
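As an illustration of the second optimization, here is a minimal standalone sketch of the fast-path check in plain std:: C++ rather than Ladybird's AK/LibTextCodec types; the function name needs_newline_filtering is made up for this sketch and is not part of the tokenizer.

```cpp
#include <string_view>

// Returns true only if the decoded style sheet text contains a carriage
// return or form feed; only in that case does the tokenizer have to
// rebuild the text with normalized newlines.
bool needs_newline_filtering(std::string_view decoded_input)
{
    for (unsigned char byte : decoded_input) {
        if (byte == '\r' || byte == '\f')
            return true;
    }
    return false;
}
```

Scanning raw bytes is sufficient here because '\r' and '\f' are ASCII and can never occur inside a multi-byte UTF-8 sequence, so the check stays a single linear pass with no decoding work.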
This commit is contained in:
parent
7892ee355d
commit
dba6216caa
Notes:
github-actions[bot]
2024-07-20 13:36:31 +00:00
Author: https://github.com/awesomekling
Commit: dba6216caa
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/735
Reviewed-by: https://github.com/AtkinsSJ ✅
1 changed file with 16 additions and 3 deletions
@@ -202,11 +202,25 @@ ErrorOr<Vector<Token>> Tokenizer::tokenize(StringView input, StringView encoding
     auto decoder = TextCodec::decoder_for(encoding);
     VERIFY(decoder.has_value());
 
+    auto decoded_input = TRY(decoder->to_utf8(input));
+
+    // OPTIMIZATION: If the input doesn't contain any CR or FF, we can skip the filtering
+    bool const contains_cr_or_ff = [&] {
+        for (auto byte : decoded_input.bytes()) {
+            if (byte == '\r' || byte == '\f')
+                return true;
+        }
+        return false;
+    }();
+    if (!contains_cr_or_ff) {
+        return decoded_input;
+    }
+
     StringBuilder builder { input.length() };
     bool last_was_carriage_return = false;
 
     // To filter code points from a stream of (unfiltered) code points input:
-    TRY(decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) -> ErrorOr<void> {
+    for (auto code_point : decoded_input.code_points()) {
         // Replace any U+000D CARRIAGE RETURN (CR) code points,
         // U+000C FORM FEED (FF) code points,
         // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)

@@ -236,8 +250,7 @@ ErrorOr<Vector<Token>> Tokenizer::tokenize(StringView input, StringView encoding
 
             last_was_carriage_return = false;
         }
-        return {};
-    }));
+    }
     return builder.to_string_without_validation();
 };
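The hunks above cut off at the comment describing the replacement rule, so the normalization loop itself is not visible. As a rough, hedged sketch of that slow path (plain std:: C++, not the StringBuilder/code-point iteration the tokenizer actually uses; the helper name normalize_newlines is invented here), the rule cited in the comments collapses CR, FF, and CRLF pairs to a single LF:

```cpp
#include <string>
#include <string_view>

// Collapse CR, FF, and CRLF pairs into a single LF, copying every other
// byte through unchanged (which preserves multi-byte UTF-8 sequences).
std::string normalize_newlines(std::string_view decoded_input)
{
    std::string builder;
    builder.reserve(decoded_input.size());

    bool last_was_carriage_return = false;
    for (char byte : decoded_input) {
        if (byte == '\r') {
            // A CR always yields one LF; if an LF follows, it belongs to
            // this CRLF pair and is skipped by the branch below.
            builder += '\n';
            last_was_carriage_return = true;
        } else if (byte == '\n') {
            if (!last_was_carriage_return)
                builder += '\n';
            last_was_carriage_return = false;
        } else if (byte == '\f') {
            builder += '\n';
            last_was_carriage_return = false;
        } else {
            builder += byte;
            last_was_carriage_return = false;
        }
    }
    return builder;
}
```

Keeping the last_was_carriage_return flag, as the real code does, is what distinguishes a lone LF (kept) from the LF of a CRLF pair (already accounted for by the preceding CR).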