From d294f150ed3f1ef2d4aae8bae410802a77bb7486 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 8 May 2024 18:57:53 -0400 Subject: [PATCH] LibGfx+LibCompress: WebPWriter performance regression reduction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves both Gfx::CanonicalCode::write_symbol() and Compress::CanonicalCode::write_symbol() inline. It also adds `__attribute__((always_inline))` on the arguments to visit() in the latter. (ALWAYS_INLINE doesn't work on lambdas.) Numbers with `ministat`: I ran once: Build/lagom/bin/image -o test.bmp Base/res/wallpapers/sunset-retro.png and then ran to bench: ~/src/hack/bench.py -n 20 -o bench_foo1.txt \ Build/lagom/bin/image -o test.webp test.bmp ...and then `ministat bench_foo1.txt bench_foo2.txt` to compare. The previous commit increased the time for this command by 38% compared to the before state. With this, it's an 8.6% regression. So still a regression, but a smaller one. Or, in other words, this commit reduces times by 21% compared to the previous commit. Numbers with hyperfine are similar -- with this on top of the previous commit, this is a 7-11% regression, instead of an almost 50% regression. (A local branch that changes how we compute CanonicalCodes so that we actually compress a bit is perf-neutral since the image writing code doesn't change.) `hyperfine 'image -o test.webp test.bmp'`: * Before: 23.7 ms ± 0.7 ms (116 runs) * Previous commit: 33.2 ms ± 0.8 ms (82 runs) * This commit: 25.5 ms ± 0.7 ms (102 runs) `hyperfine 'animation -o wow.webp giphy.gif'`: * Before: 85.5 ms ± 2.0 ms (34 runs) * Previous commit: 127.7 ms ± 4.4 ms (22 runs) * This commit: 95.3 ms ± 2.1 ms (31 runs) `hyperfine 'animation -o wow.webp 7z7c.gif'`: * Before: 12.6 ms ± 0.6 ms (198 runs) * Previous commit: 16.5 ms ± 0.9 ms (153 runs) * This commit: 13.5 ms ± 0.6 ms (186 runs) --- Userland/Libraries/LibCompress/Deflate.cpp | 9 --------- Userland/Libraries/LibCompress/Deflate.h | 9 +++++++++ .../Libraries/LibGfx/ImageFormats/WebPSharedLossless.cpp | 8 -------- .../Libraries/LibGfx/ImageFormats/WebPSharedLossless.h | 8 ++++++++ 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Userland/Libraries/LibCompress/Deflate.cpp b/Userland/Libraries/LibCompress/Deflate.cpp index d35201af431..2c8ac8d2abd 100644 --- a/Userland/Libraries/LibCompress/Deflate.cpp +++ b/Userland/Libraries/LibCompress/Deflate.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -166,14 +165,6 @@ ErrorOr CanonicalCode::read_symbol(LittleEndianInputBitStream& stream) cons return Error::from_string_literal("Symbol exceeds maximum symbol number"); } -ErrorOr CanonicalCode::write_symbol(LittleEndianOutputBitStream& stream, u32 symbol) const -{ - auto code = symbol < m_bit_codes.size() ? m_bit_codes[symbol] : 0u; - auto length = symbol < m_bit_code_lengths.size() ? m_bit_code_lengths[symbol] : 0u; - TRY(stream.write_bits(code, length)); - return {}; -} - DeflateDecompressor::CompressedBlock::CompressedBlock(DeflateDecompressor& decompressor, CanonicalCode literal_codes, Optional distance_codes) : m_decompressor(decompressor) , m_literal_codes(literal_codes) diff --git a/Userland/Libraries/LibCompress/Deflate.h b/Userland/Libraries/LibCompress/Deflate.h index 00f237a0b98..74bf5c538e1 100644 --- a/Userland/Libraries/LibCompress/Deflate.h +++ b/Userland/Libraries/LibCompress/Deflate.h @@ -7,6 +7,7 @@ #pragma once +#include #include #include #include @@ -51,6 +52,14 @@ private: Vector m_bit_code_lengths {}; }; +ALWAYS_INLINE ErrorOr CanonicalCode::write_symbol(LittleEndianOutputBitStream& stream, u32 symbol) const +{ + auto code = symbol < m_bit_codes.size() ? m_bit_codes[symbol] : 0u; + auto length = symbol < m_bit_code_lengths.size() ? m_bit_code_lengths[symbol] : 0u; + TRY(stream.write_bits(code, length)); + return {}; +} + class DeflateDecompressor final : public Stream { private: class CompressedBlock { diff --git a/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.cpp b/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.cpp index 2545594d649..3b5b4928e0e 100644 --- a/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.cpp +++ b/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.cpp @@ -32,12 +32,4 @@ ErrorOr CanonicalCode::read_symbol(LittleEndianInputBitStream& bit_stream) [&bit_stream](Compress::CanonicalCode const& code) { return code.read_symbol(bit_stream); })); } -ErrorOr CanonicalCode::write_symbol(LittleEndianOutputBitStream& bit_stream, u32 symbol) const -{ - TRY(m_code.visit( - [&](u32 single_code) -> ErrorOr { VERIFY(symbol == single_code); return {};}, - [&](Compress::CanonicalCode const& code) { return code.write_symbol(bit_stream, symbol); })); - return {}; -} - } diff --git a/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.h b/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.h index f6da1c84fb2..34fb372db3f 100644 --- a/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.h +++ b/Userland/Libraries/LibGfx/ImageFormats/WebPSharedLossless.h @@ -39,6 +39,14 @@ private: Variant m_code; }; +ALWAYS_INLINE ErrorOr CanonicalCode::write_symbol(LittleEndianOutputBitStream& bit_stream, u32 symbol) const +{ + TRY(m_code.visit( + [&](u32 single_code) __attribute__((always_inline))->ErrorOr { VERIFY(symbol == single_code); return {}; }, + [&](Compress::CanonicalCode const& code) __attribute__((always_inline)) { return code.write_symbol(bit_stream, symbol); })); + return {}; +} + // https://developers.google.com/speed/webp/docs/webp_lossless_bitstream_specification#61_overview // "From here on, we refer to this set as a prefix code group." class PrefixCodeGroup {