From 1a46d8df5fc81eb2c320d5c8a5597285d3d8fb3a Mon Sep 17 00:00:00 2001
From: Andreas Kling
Date: Sat, 20 Jul 2024 13:05:20 +0200
Subject: [PATCH] LibTextCodec: Use String::from_utf8() when decoding UTF-8 to UTF-8

This way, we still perform UTF-8 validation, but don't go through the
slow generic code path that rebuilds the decoded string one code point
at a time.

This was a bottleneck when loading a canned copy of reddit.com, which
ended up being ~120 MiB large.

- Time spent decoding UTF-8 before this change: 1192 ms
- Time spent decoding UTF-8 after this change: 154 ms

That's still a long time, but 7.7x faster is nothing to sneeze at! :^)

Note that if the input fails UTF-8 validation, we still fall back to
the slow path and insert replacement characters per the WHATWG Encoding
spec: https://encoding.spec.whatwg.org/#utf-8-decode
---
 Userland/Libraries/LibTextCodec/Decoder.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 0fcb526e149..43d47478d54 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -372,6 +372,9 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
         bomless_input = input.substring_view(3);
     }
 
+    if (Utf8View(bomless_input).validate())
+        return String::from_utf8_without_validation(bomless_input.bytes());
+
     return Decoder::to_utf8(bomless_input);
 }