AK+LibWeb: Replace our home-grown base64 encoders/decoders with simdutf

We currently have 2 base64 coders: one in AK, another in LibWeb for a
"forgiving" implementation. ECMA-262 has an upcoming proposal which will
require a third implementation.

Instead, let's use the base64 implementation that is used by Node.js and
recommended by the upcoming proposal. It handles forgiving decoding as
well.

Our users of AK's implementation should be fine with the forgiving
implementation. The AK impl originally had naive forgiving behavior, but
that was removed solely for performance reasons.

Using http://mattmahoney.net/dc/enwik8.zip (100MB unzipped) as a test,
performance of our old home-grown implementations vs. the simdutf
implementation (on Linux x64):

                Encode    Decode
AK base64       0.226s    0.169s
LibWeb base64   N/A       1.244s
simdutf         0.161s    0.047s
This commit is contained in:
Timothy Flynn 2024-07-15 15:25:08 -04:00 committed by Andreas Kling
commit bfc9dc447f
Notes: sideshowbarker 2024-07-16 23:34:49 +09:00
11 changed files with 60 additions and 310 deletions

View file

@ -4,116 +4,51 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Assertions.h>
#define AK_DONT_REPLACE_STD
#include <AK/Base64.h>
#include <AK/Error.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <simdutf.h>
namespace AK {
// NOTE(review): this region is a rendered diff (hunk "@ -4,116 +4,51 @@") whose removed and added lines are
// interleaved without +/- markers, so it does not compile as written.
//
// calculate_base64_decoded_length() estimates how many bytes `input` decodes to: three bytes per four
// characters, minus one byte per trailing '=' padding character (two at most).
size_t calculate_base64_decoded_length(StringView input)
// The simdutf-flavoured decode_base64_impl() signature on the next line appears to be the added side of the diff
// and to share the opening brace below — confirm against the post-commit file.
static ErrorOr<ByteBuffer> decode_base64_impl(StringView input, simdutf::base64_options options)
{
auto length = input.length() * 3 / 4;
// An input ending in "==" also ends in "=", so both checks fire and two bytes are subtracted.
if (input.ends_with("="sv))
--length;
if (input.ends_with("=="sv))
--length;
return length;
}
// Returns the length of the padded base64 text for `input`: 4 * size / 3 (integer division), rounded up to the
// next multiple of four by adding 3 and then clearing the low two bits.
size_t calculate_base64_encoded_length(ReadonlyBytes input)
{
return ((4 * input.size() / 3) + 3) & ~3;
}
// decode_base64_impl(): decodes base64 text in `input` into a ByteBuffer, or returns an Error.
//
// NOTE(review): this body is a rendered diff with two implementations interleaved line by line and no +/- markers:
//  - a hand-written decoder driven by `alphabet_lookup_table` (the `get` lambda and the `while` loop), and
//  - a simdutf-based decoder (the simdutf:: calls, which use an `options` value that this signature does not declare).
// Going by the commit title, the simdutf lines are presumably the added side — confirm against the post-commit file.
static ErrorOr<ByteBuffer> decode_base64_impl(StringView input, ReadonlySpan<i16> alphabet_lookup_table)
{
// Hand-written path: surrounding whitespace is dropped, and the remaining length must be a multiple of four.
input = input.trim_whitespace();
if (input.length() % 4 != 0)
return Error::from_string_literal("Invalid length of Base64 encoded string");
// get(): looks up the alphabet value of the character at `offset`.
//  - Past the end of the input it yields 0.
//  - '=' is accepted only when the caller passes an `is_padding` pointer; the flag is set and 0 is returned.
//  - A negative table entry marks a character that is not in the alphabet and produces an error.
auto get = [&](size_t offset, bool* is_padding) -> ErrorOr<u8> {
if (offset >= input.length())
return 0;
auto ch = static_cast<unsigned char>(input[offset]);
if (ch == '=') {
if (!is_padding)
return Error::from_string_literal("Invalid '=' character outside of padding in base64 data");
*is_padding = true;
return 0;
}
i16 result = alphabet_lookup_table[ch];
if (result < 0)
return Error::from_string_literal("Invalid character in base64 data");
VERIFY(result < 256);
return { result };
};
ByteBuffer output;
// Two alternative sizings of `output` follow: the first uses calculate_base64_decoded_length(), the second uses
// simdutf::maximal_binary_length_from_base64() (trimmed to the real size via output.resize() near the end).
TRY(output.try_resize(calculate_base64_decoded_length(input)));
TRY(output.try_resize(simdutf::maximal_binary_length_from_base64(input.characters_without_null_termination(), input.length())));
size_t input_offset = 0;
size_t output_offset = 0;
// simdutf path: decode the whole input straight into `output`'s storage.
auto result = simdutf::base64_to_binary(
input.characters_without_null_termination(),
input.length(),
reinterpret_cast<char*>(output.data()),
options);
// Hand-written path: consume four characters per iteration and emit up to three bytes.
// Only the third and fourth character of a group are allowed to be '=' (nullptr is passed for the first two).
while (input_offset < input.length()) {
bool in2_is_padding = false;
bool in3_is_padding = false;
u8 const in0 = TRY(get(input_offset++, nullptr));
u8 const in1 = TRY(get(input_offset++, nullptr));
u8 const in2 = TRY(get(input_offset++, &in2_is_padding));
u8 const in3 = TRY(get(input_offset++, &in3_is_padding));
// Byte 0: the six bits of in0 followed by the top two bits of in1.
output[output_offset++] = (in0 << 2) | ((in1 >> 4) & 3);
// Byte 1: the low four bits of in1 followed by the top four bits of in2; skipped when in2 is padding.
if (!in2_is_padding)
output[output_offset++] = ((in1 & 0xf) << 4) | ((in2 >> 2) & 0xf);
// Byte 2: the low two bits of in2 followed by the six bits of in3; skipped when in3 is padding.
if (!in3_is_padding)
output[output_offset++] = ((in2 & 0x3) << 6) | in3;
}
// simdutf path: any result other than SUCCESS is reported with a single generic error message.
if (result.error != simdutf::SUCCESS)
return Error::from_string_literal("Invalid base64-encoded data");
// Shrink the buffer to result.count — presumably the number of bytes simdutf wrote; confirm in the simdutf docs.
output.resize(result.count);
return output;
}
// encode_base64_impl(): encodes the bytes of `input` as base64 text and returns it as a String.
//
// NOTE(review): this is a rendered diff with two implementations interleaved and no +/- markers. The first
// signature (taking `alphabet`) belongs to the hand-written encoder; the second (taking simdutf `options`) belongs
// to the simdutf-based one, presumably the added side. The function's closing brace falls outside the visible hunk.
static ErrorOr<String> encode_base64_impl(ReadonlyBytes input, ReadonlySpan<char> alphabet)
static ErrorOr<String> encode_base64_impl(StringView input, simdutf::base64_options options)
{
Vector<u8> output;
// Hand-written path: reserve room up front so the unchecked_append() calls in the loop below have capacity.
TRY(output.try_ensure_capacity(calculate_base64_encoded_length(input)));
// get(): returns the input byte at `offset`; past the end it returns 0 and, if a flag pointer was given, sets it.
auto get = [&](size_t const offset, bool* need_padding = nullptr) -> u8 {
if (offset >= input.size()) {
if (need_padding)
*need_padding = true;
return 0;
}
return input[offset];
};
// simdutf does not append padding to base64url encodings. We use the default encoding option here to allocate room
// for the padding characters that we will later append ourselves if necessary.
TRY(output.try_resize(simdutf::base64_length_from_binary(input.length(), simdutf::base64_default)));
// Hand-written path: each iteration turns up to three input bytes into four output characters.
// `is_16bit` is set when the second byte of the group is missing, `is_8bit` when the third byte is missing.
for (size_t i = 0; i < input.size(); i += 3) {
bool is_8bit = false;
bool is_16bit = false;
// simdutf path: encode the whole input into `output`; the return value is kept as the number of characters written.
auto size_written = simdutf::binary_to_base64(
input.characters_without_null_termination(),
input.length(),
reinterpret_cast<char*>(output.data()),
options);
u8 const in0 = get(i);
u8 const in1 = get(i + 1, &is_16bit);
u8 const in2 = get(i + 2, &is_8bit);
// Split the 24 bits of (in0, in1, in2) into four 6-bit alphabet indices.
u8 const index0 = (in0 >> 2) & 0x3f;
u8 const index1 = ((in0 << 4) | (in1 >> 4)) & 0x3f;
u8 const index2 = ((in1 << 2) | (in2 >> 6)) & 0x3f;
u8 const index3 = in2 & 0x3f;
output.unchecked_append(alphabet[index0]);
output.unchecked_append(alphabet[index1]);
// Positions whose source byte was missing are emitted as '=' padding instead of an alphabet character.
output.unchecked_append(is_16bit ? '=' : alphabet[index2]);
output.unchecked_append(is_8bit ? '=' : alphabet[index3]);
// simdutf path: for base64url, fill everything between what was written and the pre-sized end with '='.
if (options == simdutf::base64_url) {
for (size_t i = size_written; i < output.size(); ++i)
output[i] = '=';
}
// The output is handed to String without UTF-8 validation.
return String::from_utf8_without_validation(output);
@ -121,23 +56,22 @@ static ErrorOr<String> encode_base64_impl(ReadonlyBytes input, ReadonlySpan<char
// Decodes `input` as base64 by delegating to decode_base64_impl().
// NOTE(review): two variants of the call are interleaved here by the diff rendering — one passing the table from
// base64_lookup_table(), one passing simdutf::base64_default (presumably the added side).
ErrorOr<ByteBuffer> decode_base64(StringView input)
{
static constexpr auto lookup_table = base64_lookup_table();
return decode_base64_impl(input, lookup_table);
return decode_base64_impl(input, simdutf::base64_default);
}
// Decodes `input` as base64url by delegating to decode_base64_impl().
// NOTE(review): two variants of the call are interleaved here by the diff rendering — one passing the table from
// base64url_lookup_table(), one passing simdutf::base64_url (presumably the added side).
ErrorOr<ByteBuffer> decode_base64url(StringView input)
{
static constexpr auto lookup_table = base64url_lookup_table();
return decode_base64_impl(input, lookup_table);
return decode_base64_impl(input, simdutf::base64_url);
}
// Encodes `input` as base64 by delegating to encode_base64_impl().
// NOTE(review): two variants of the call are interleaved here by the diff rendering — one passing base64_alphabet,
// one passing simdutf::base64_default (presumably the added side).
ErrorOr<String> encode_base64(ReadonlyBytes input)
{
return encode_base64_impl(input, base64_alphabet);
return encode_base64_impl(input, simdutf::base64_default);
}
// Encodes `input` as base64url by delegating to encode_base64_impl().
// NOTE(review): two variants of the call are interleaved here by the diff rendering — one passing
// base64url_alphabet, one passing simdutf::base64_url (presumably the added side).
ErrorOr<String> encode_base64url(ReadonlyBytes input)
{
return encode_base64_impl(input, base64url_alphabet);
return encode_base64_impl(input, simdutf::base64_url);
}
}