From bfc9dc447f42eb9d7d0927721291d1b528b3125d Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 15 Jul 2024 15:25:08 -0400 Subject: [PATCH] AK+LibWeb: Replace our home-grown base64 encoder/decoders with simdutf We currently have 2 base64 coders: one in AK, another in LibWeb for a "forgiving" implementation. ECMA-262 has an upcoming proposal which will require a third implementation. Instead, let's use the base64 implementation that is used by Node.js and recommended by the upcoming proposal. It handles forgiving decoding as well. Our users of AK's implementation should be fine with the forgiving implementation. The AK impl originally had naive forgiving behavior, but that was removed solely for performance reasons. Using http://mattmahoney.net/dc/enwik8.zip (100MB unzipped) as a test, performance of our old home-grown implementations vs. the simdutf implementation (on Linux x64): Encode Decode AK base64 0.226s 0.169s LibWeb base64 N/A 1.244s simdutf 0.161s 0.047s --- AK/Base64.cpp | 128 +++++------------- AK/Base64.h | 56 +------- AK/CMakeLists.txt | 10 +- Tests/AK/TestBase64.cpp | 17 +-- Userland/Libraries/LibWeb/CMakeLists.txt | 1 - .../LibWeb/Fetch/Infrastructure/URL.cpp | 4 +- Userland/Libraries/LibWeb/HTML/Window.cpp | 2 - .../LibWeb/HTML/WindowOrWorkerGlobalScope.cpp | 5 +- Userland/Libraries/LibWeb/Infra/Base64.cpp | 123 ----------------- Userland/Libraries/LibWeb/Infra/Base64.h | 15 -- vcpkg.json | 9 +- 11 files changed, 60 insertions(+), 310 deletions(-) delete mode 100644 Userland/Libraries/LibWeb/Infra/Base64.cpp delete mode 100644 Userland/Libraries/LibWeb/Infra/Base64.h diff --git a/AK/Base64.cpp b/AK/Base64.cpp index a8fa7f8c8b3..7a579440649 100644 --- a/AK/Base64.cpp +++ b/AK/Base64.cpp @@ -4,116 +4,51 @@ * SPDX-License-Identifier: BSD-2-Clause */ -#include +#define AK_DONT_REPLACE_STD + #include -#include -#include #include #include +#include + namespace AK { -size_t calculate_base64_decoded_length(StringView input) +static ErrorOr decode_base64_impl(StringView input, simdutf::base64_options options) { - auto length = input.length() * 3 / 4; - - if (input.ends_with("="sv)) - --length; - if (input.ends_with("=="sv)) - --length; - - return length; -} - -size_t calculate_base64_encoded_length(ReadonlyBytes input) -{ - return ((4 * input.size() / 3) + 3) & ~3; -} - -static ErrorOr decode_base64_impl(StringView input, ReadonlySpan alphabet_lookup_table) -{ - input = input.trim_whitespace(); - - if (input.length() % 4 != 0) - return Error::from_string_literal("Invalid length of Base64 encoded string"); - - auto get = [&](size_t offset, bool* is_padding) -> ErrorOr { - if (offset >= input.length()) - return 0; - - auto ch = static_cast(input[offset]); - if (ch == '=') { - if (!is_padding) - return Error::from_string_literal("Invalid '=' character outside of padding in base64 data"); - *is_padding = true; - return 0; - } - - i16 result = alphabet_lookup_table[ch]; - if (result < 0) - return Error::from_string_literal("Invalid character in base64 data"); - VERIFY(result < 256); - return { result }; - }; - ByteBuffer output; - TRY(output.try_resize(calculate_base64_decoded_length(input))); + TRY(output.try_resize(simdutf::maximal_binary_length_from_base64(input.characters_without_null_termination(), input.length()))); - size_t input_offset = 0; - size_t output_offset = 0; + auto result = simdutf::base64_to_binary( + input.characters_without_null_termination(), + input.length(), + reinterpret_cast(output.data()), + options); - while (input_offset < input.length()) { - bool in2_is_padding = false; - bool in3_is_padding = false; - - u8 const in0 = TRY(get(input_offset++, nullptr)); - u8 const in1 = TRY(get(input_offset++, nullptr)); - u8 const in2 = TRY(get(input_offset++, &in2_is_padding)); - u8 const in3 = TRY(get(input_offset++, &in3_is_padding)); - - output[output_offset++] = (in0 << 2) | ((in1 >> 4) & 3); - - if (!in2_is_padding) - output[output_offset++] = ((in1 & 0xf) << 4) | ((in2 >> 2) & 0xf); - - if (!in3_is_padding) - output[output_offset++] = ((in2 & 0x3) << 6) | in3; - } + if (result.error != simdutf::SUCCESS) + return Error::from_string_literal("Invalid base64-encoded data"); + output.resize(result.count); return output; } -static ErrorOr encode_base64_impl(ReadonlyBytes input, ReadonlySpan alphabet) +static ErrorOr encode_base64_impl(StringView input, simdutf::base64_options options) { Vector output; - TRY(output.try_ensure_capacity(calculate_base64_encoded_length(input))); - auto get = [&](size_t const offset, bool* need_padding = nullptr) -> u8 { - if (offset >= input.size()) { - if (need_padding) - *need_padding = true; - return 0; - } - return input[offset]; - }; + // simdutf does not append padding to base64url encodings. We use the default encoding option here to allocate room + // for the padding characters that we will later append ourselves if necessary. + TRY(output.try_resize(simdutf::base64_length_from_binary(input.length(), simdutf::base64_default))); - for (size_t i = 0; i < input.size(); i += 3) { - bool is_8bit = false; - bool is_16bit = false; + auto size_written = simdutf::binary_to_base64( + input.characters_without_null_termination(), + input.length(), + reinterpret_cast(output.data()), + options); - u8 const in0 = get(i); - u8 const in1 = get(i + 1, &is_16bit); - u8 const in2 = get(i + 2, &is_8bit); - - u8 const index0 = (in0 >> 2) & 0x3f; - u8 const index1 = ((in0 << 4) | (in1 >> 4)) & 0x3f; - u8 const index2 = ((in1 << 2) | (in2 >> 6)) & 0x3f; - u8 const index3 = in2 & 0x3f; - - output.unchecked_append(alphabet[index0]); - output.unchecked_append(alphabet[index1]); - output.unchecked_append(is_16bit ? '=' : alphabet[index2]); - output.unchecked_append(is_8bit ? '=' : alphabet[index3]); + if (options == simdutf::base64_url) { + for (size_t i = size_written; i < output.size(); ++i) + output[i] = '='; } return String::from_utf8_without_validation(output); @@ -121,23 +56,22 @@ static ErrorOr encode_base64_impl(ReadonlyBytes input, ReadonlySpan decode_base64(StringView input) { - static constexpr auto lookup_table = base64_lookup_table(); - return decode_base64_impl(input, lookup_table); + return decode_base64_impl(input, simdutf::base64_default); } ErrorOr decode_base64url(StringView input) { - static constexpr auto lookup_table = base64url_lookup_table(); - return decode_base64_impl(input, lookup_table); + return decode_base64_impl(input, simdutf::base64_url); } ErrorOr encode_base64(ReadonlyBytes input) { - return encode_base64_impl(input, base64_alphabet); + return encode_base64_impl(input, simdutf::base64_default); } + ErrorOr encode_base64url(ReadonlyBytes input) { - return encode_base64_impl(input, base64url_alphabet); + return encode_base64_impl(input, simdutf::base64_url); } } diff --git a/AK/Base64.h b/AK/Base64.h index f9f29bab6c9..73761cceaa2 100644 --- a/AK/Base64.h +++ b/AK/Base64.h @@ -6,7 +6,6 @@ #pragma once -#include #include #include #include @@ -14,59 +13,12 @@ namespace AK { -// https://datatracker.ietf.org/doc/html/rfc4648#section-4 -constexpr Array base64_alphabet = { - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', - 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', - 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', - 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', - '4', '5', '6', '7', '8', '9', '+', '/' -}; +ErrorOr decode_base64(StringView); +ErrorOr decode_base64url(StringView); -// https://datatracker.ietf.org/doc/html/rfc4648#section-5 -constexpr Array base64url_alphabet = { - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', - 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', - 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', - 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', - 'w', 'x', 'y', 'z', '0', '1', '2', '3', - '4', '5', '6', '7', '8', '9', '-', '_' -}; +ErrorOr encode_base64(ReadonlyBytes); +ErrorOr encode_base64url(ReadonlyBytes); -consteval auto base64_lookup_table() -{ - Array table; - table.fill(-1); - for (size_t i = 0; i < base64_alphabet.size(); ++i) { - table[base64_alphabet[i]] = static_cast(i); - } - return table; -} - -consteval auto base64url_lookup_table() -{ - Array table; - table.fill(-1); - for (size_t i = 0; i < base64url_alphabet.size(); ++i) { - table[base64url_alphabet[i]] = static_cast(i); - } - return table; -} - -[[nodiscard]] size_t calculate_base64_decoded_length(StringView); - -[[nodiscard]] size_t calculate_base64_encoded_length(ReadonlyBytes); - -[[nodiscard]] ErrorOr decode_base64(StringView); -[[nodiscard]] ErrorOr decode_base64url(StringView); - -[[nodiscard]] ErrorOr encode_base64(ReadonlyBytes); -[[nodiscard]] ErrorOr encode_base64url(ReadonlyBytes); } #if USING_AK_GLOBALLY diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt index f566509651c..1f8bbc0b940 100644 --- a/AK/CMakeLists.txt +++ b/AK/CMakeLists.txt @@ -1,6 +1,5 @@ set(SOURCES Assertions.cpp - Base64.cpp CircularBuffer.cpp ConstrainedStream.cpp CountingStream.cpp @@ -38,6 +37,10 @@ set(SOURCES kmalloc.cpp ) +if (NOT LAGOM_TOOLS_ONLY) + list(APPEND SOURCES Base64.cpp) +endif() + serenity_lib(AK ak) serenity_install_headers(AK) @@ -56,3 +59,8 @@ if (Backtrace_FOUND) else() message(WARNING "Backtrace not found, stack traces will be unavailable") endif() + +if (NOT LAGOM_TOOLS_ONLY) + find_package(simdutf REQUIRED) + target_link_libraries(AK PRIVATE simdutf::simdutf) +endif() diff --git a/Tests/AK/TestBase64.cpp b/Tests/AK/TestBase64.cpp index a04a0c8dd5b..5fcad1af25b 100644 --- a/Tests/AK/TestBase64.cpp +++ b/Tests/AK/TestBase64.cpp @@ -7,15 +7,13 @@ #include #include -#include #include TEST_CASE(test_decode) { auto decode_equal = [&](StringView input, StringView expected) { auto decoded = TRY_OR_FAIL(decode_base64(input)); - EXPECT(ByteString::copy(decoded) == expected); - EXPECT(expected.length() <= calculate_base64_decoded_length(input.bytes())); + EXPECT_EQ(StringView { decoded }, expected); }; decode_equal(""sv, ""sv); @@ -26,7 +24,7 @@ TEST_CASE(test_decode) decode_equal("Zm9vYmE="sv, "fooba"sv); decode_equal("Zm9vYmFy"sv, "foobar"sv); decode_equal(" Zm9vYmFy "sv, "foobar"sv); - decode_equal(" \n\r \t Zm9vYmFy \n"sv, "foobar"sv); + decode_equal(" \n\r \t Zm 9v \t YmFy \n"sv, "foobar"sv); decode_equal("aGVsbG8/d29ybGQ="sv, "hello?world"sv); } @@ -42,9 +40,7 @@ TEST_CASE(test_decode_invalid) EXPECT(decode_base64url("aGVsbG8/d29ybGQ="sv).is_error()); EXPECT(decode_base64("Y"sv).is_error()); - EXPECT(decode_base64("YQ"sv).is_error()); EXPECT(decode_base64("YQ="sv).is_error()); - EXPECT(decode_base64("PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxMC42MDUiIGhlaWdodD0iMTUuNTU1Ij48cGF0aCBmaWxsPSIjODg5IiBkPSJtMi44MjggMTUuNTU1IDcuNzc3LTcuNzc5TDIuODI4IDAgMCAyLjgyOGw0Ljk0OSA0Ljk0OEwwIDEyLjcyN2wyLjgyOCAyLjgyOHoiLz48L3N2Zz4"sv).is_error()); } TEST_CASE(test_decode_only_padding) @@ -65,8 +61,7 @@ TEST_CASE(test_encode) { auto encode_equal = [&](StringView input, StringView expected) { auto encoded = MUST(encode_base64(input.bytes())); - EXPECT(encoded == expected); - EXPECT_EQ(expected.length(), calculate_base64_encoded_length(input.bytes())); + EXPECT_EQ(encoded, expected); }; encode_equal(""sv, ""sv); @@ -82,8 +77,7 @@ TEST_CASE(test_urldecode) { auto decode_equal = [&](StringView input, StringView expected) { auto decoded = TRY_OR_FAIL(decode_base64url(input)); - EXPECT(ByteString::copy(decoded) == expected); - EXPECT(expected.length() <= calculate_base64_decoded_length(input.bytes())); + EXPECT_EQ(StringView { decoded }, expected); }; decode_equal(""sv, ""sv); @@ -104,8 +98,7 @@ TEST_CASE(test_urlencode) { auto encode_equal = [&](StringView input, StringView expected) { auto encoded = MUST(encode_base64url(input.bytes())); - EXPECT(encoded == expected); - EXPECT_EQ(expected.length(), calculate_base64_encoded_length(input.bytes())); + EXPECT_EQ(encoded, expected); }; encode_equal(""sv, ""sv); diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt index 0eaf0b5be6e..0a6f36931fe 100644 --- a/Userland/Libraries/LibWeb/CMakeLists.txt +++ b/Userland/Libraries/LibWeb/CMakeLists.txt @@ -460,7 +460,6 @@ set(SOURCES HTML/ValidityState.cpp HighResolutionTime/Performance.cpp HighResolutionTime/TimeOrigin.cpp - Infra/Base64.cpp Infra/ByteSequences.cpp Infra/JSON.cpp Infra/Strings.cpp diff --git a/Userland/Libraries/LibWeb/Fetch/Infrastructure/URL.cpp b/Userland/Libraries/LibWeb/Fetch/Infrastructure/URL.cpp index 47c3533f33b..8755b48b120 100644 --- a/Userland/Libraries/LibWeb/Fetch/Infrastructure/URL.cpp +++ b/Userland/Libraries/LibWeb/Fetch/Infrastructure/URL.cpp @@ -6,10 +6,10 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#include #include #include #include -#include namespace Web::Fetch::Infrastructure { @@ -79,7 +79,7 @@ ErrorOr process_data_url(URL::URL const& data_url) // 2. Set body to the forgiving-base64 decode of stringBody. // 3. If body is failure, then return failure. - body = TRY(Infra::decode_forgiving_base64(string_body)); + body = TRY(decode_base64(string_body)); // 4. Remove the last 6 code points from mimeType. // 5. Remove trailing U+0020 SPACE code points from mimeType, if any. diff --git a/Userland/Libraries/LibWeb/HTML/Window.cpp b/Userland/Libraries/LibWeb/HTML/Window.cpp index ecaeff4d44f..a8ce90b5312 100644 --- a/Userland/Libraries/LibWeb/HTML/Window.cpp +++ b/Userland/Libraries/LibWeb/HTML/Window.cpp @@ -6,7 +6,6 @@ * SPDX-License-Identifier: BSD-2-Clause */ -#include #include #include #include @@ -59,7 +58,6 @@ #include #include #include -#include #include #include #include diff --git a/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp b/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp index e4010bd47e0..af58d02d5f6 100644 --- a/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp +++ b/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -130,14 +129,14 @@ WebIDL::ExceptionOr WindowOrWorkerGlobalScopeMixin::atob(String const& d auto& realm = *vm.current_realm(); // 1. Let decodedData be the result of running forgiving-base64 decode on data. - auto decoded_data = Infra::decode_forgiving_base64(data.bytes_as_string_view()); + auto decoded_data = decode_base64(data); // 2. If decodedData is failure, then throw an "InvalidCharacterError" DOMException. if (decoded_data.is_error()) return WebIDL::InvalidCharacterError::create(realm, "Input string is not valid base64 data"_fly_string); // 3. Return decodedData. - // decode_base64() returns a byte string. LibJS uses UTF-8 for strings. Use Latin1Decoder to convert bytes 128-255 to UTF-8. + // decode_base64() returns a byte buffer. LibJS uses UTF-8 for strings. Use Latin1Decoder to convert bytes 128-255 to UTF-8. auto decoder = TextCodec::decoder_for_exact_name("ISO-8859-1"sv); VERIFY(decoder.has_value()); return TRY_OR_THROW_OOM(vm, decoder->to_utf8(decoded_data.value())); diff --git a/Userland/Libraries/LibWeb/Infra/Base64.cpp b/Userland/Libraries/LibWeb/Infra/Base64.cpp deleted file mode 100644 index 898006b0def..00000000000 --- a/Userland/Libraries/LibWeb/Infra/Base64.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2022-2023, the SerenityOS developers. - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Web::Infra { - -// https://infra.spec.whatwg.org/#forgiving-base64 -ErrorOr decode_forgiving_base64(StringView input) -{ - // 1. Remove all ASCII whitespace from data. - // FIXME: It is possible to avoid copying input here, it's just a bit tricky to remove the equal signs - StringBuilder builder; - for (auto character : input) { - if (!is_ascii_whitespace(character)) - TRY(builder.try_append(character)); - } - auto data = builder.string_view(); - - // 2. If data’s code point length divides by 4 leaving no remainder, then: - if (data.length() % 4 == 0) { - // If data ends with one or two U+003D (=) code points, then remove them from data. - if (data.ends_with("=="sv)) - data = data.substring_view(0, data.length() - 2); - else if (data.ends_with('=')) - data = data.substring_view(0, data.length() - 1); - } - - // 3. If data’s code point length divides by 4 leaving a remainder of 1, then return failure. - if (data.length() % 4 == 1) - return Error::from_string_literal("Invalid input length in forgiving base64 decode"); - - // 4. If data contains a code point that is not one of - // U+002B (+), U+002F (/), ASCII alphanumeric - // then return failure. - for (auto point : data) { - if (point != '+' && point != '/' && !is_ascii_alphanumeric(point)) - return Error::from_string_literal("Invalid character in forgiving base64 decode"); - } - - // 5. Let output be an empty byte sequence. - // 6. Let buffer be an empty buffer that can have bits appended to it. - Vector output; - u32 buffer = 0; - auto accumulated_bits = 0; - - auto add_to_buffer = [&](u8 number) { - VERIFY(number < 64); - u32 buffer_mask = number; - - if (accumulated_bits == 0) - buffer_mask <<= 18; - else if (accumulated_bits == 6) - buffer_mask <<= 12; - else if (accumulated_bits == 12) - buffer_mask <<= 6; - else if (accumulated_bits == 18) - buffer_mask <<= 0; - - buffer |= buffer_mask; - - accumulated_bits += 6; - }; - - auto append_bytes = [&]() { - output.append(static_cast((buffer & 0xff0000) >> 16)); - output.append(static_cast((buffer & 0xff00) >> 8)); - output.append(static_cast(buffer & 0xff)); - - buffer = 0; - accumulated_bits = 0; - }; - - auto alphabet_lookup_table = AK::base64_lookup_table(); - - // 7. Let position be a position variable for data, initially pointing at the start of data. - // 8. While position does not point past the end of data: - for (auto point : data) { - // 1. Find the code point pointed to by position in the second column of Table 1: The Base 64 Alphabet of RFC 4648. - // Let n be the number given in the first cell of the same row. [RFC4648] - auto n = alphabet_lookup_table[point]; - VERIFY(n >= 0); - - // 2. Append the six bits corresponding to n, most significant bit first, to buffer. - add_to_buffer(static_cast(n)); - - // 3. buffer has accumulated 24 bits, - if (accumulated_bits == 24) { - // interpret them as three 8-bit big-endian numbers. - // Append three bytes with values equal to those numbers to output, in the same order, and then empty buffer - append_bytes(); - } - } - - // 9. If buffer is not empty, it contains either 12 or 18 bits. - VERIFY(accumulated_bits == 0 || accumulated_bits == 12 || accumulated_bits == 18); - - // If it contains 12 bits, then discard the last four and interpret the remaining eight as an 8-bit big-endian number. - if (accumulated_bits == 12) - output.append(static_cast((buffer & 0xff0000) >> 16)); - - // If it contains 18 bits, then discard the last two and interpret the remaining 16 as two 8-bit big-endian numbers. - // Append the one or two bytes with values equal to those one or two numbers to output, in the same order. - if (accumulated_bits == 18) { - output.append(static_cast((buffer & 0xff0000) >> 16)); - output.append(static_cast((buffer & 0xff00) >> 8)); - } - - return ByteBuffer::copy(output); -} - -} diff --git a/Userland/Libraries/LibWeb/Infra/Base64.h b/Userland/Libraries/LibWeb/Infra/Base64.h deleted file mode 100644 index 381fbebb75d..00000000000 --- a/Userland/Libraries/LibWeb/Infra/Base64.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Copyright (c) 2022-2023, the SerenityOS developers. - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#pragma once - -#include - -namespace Web::Infra { - -[[nodiscard]] ErrorOr decode_forgiving_base64(StringView); - -} diff --git a/vcpkg.json b/vcpkg.json index 853a93410fe..032a75ca8b9 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -19,6 +19,7 @@ "dav1d" ] }, + "simdutf", { "name": "skia", "platform": "osx", @@ -38,7 +39,6 @@ "platform": "android" }, "sqlite3", - "woff2", { "name": "vulkan", "platform": "!android" @@ -46,7 +46,8 @@ { "name": "vulkan-headers", "platform": "!android" - } + }, + "woff2" ], "overrides": [ { @@ -69,6 +70,10 @@ "name": "libavif", "version": "1.0.4#1" }, + { + "name": "simdutf", + "version": "5.2.5#0" + }, { "name": "skia", "version": "124#0"