ladybird/Userland/Libraries/LibGfx/ImageFormats/WebPWriterLossless.cpp
Nico Weber 1a9d8e8fbe LibCompress: When limiting huffman tree depth, sacrifice bottom of tree
Deflate and WebP can store at most 15 bits per symbol, meaning their
huffman trees can be at most 15 levels deep.

During construction, when we hit this level, we used to try again
with an ever lower frequency cap per symbol. This had the effect
of giving the symbols with the highest frequency lower frequencies
first, causing the most-frequent symbols to be merged. For example,
maybe the most-frequent symbol had 1 bit, and the 2nd-frequent
two bits (and everything else at least 3). With the cap, the two
most frequent symbols might both have 2 symbols, freeing up bits
for the lower levels of the tree.

This has the effect of making the most-frequent symbols longer at
first, which isn't great for file size.

Instead of using a frequency cap, ignore ever more of the low
bits of the frequency. This sacrifices resolution where it hurts
the lower levels of the tree first, and those are stored less
frequently.

For deflate, the 64 kiB block size means this doesn't have a big
effect, but for WebP it can have a big effect:

sunset-retro.png (876K): 2.02M -> 1.73M -- now (very slightly) smaller
than twice the input size! Maybe we'll be competitive one day.

(For wow.webp and 7z7c.webp, it has no effect, since we don't hit
the "tree too deep" case there, since those have relatively few
colors.)

No behavior change other than smaller file size. (No performance
cost either, and it's less code too.)
2024-05-26 21:00:55 +02:00

316 lines
14 KiB
C++

/*
* Copyright (c) 2024, Nico Weber <thakis@chromium.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
// Lossless format: https://developers.google.com/speed/webp/docs/webp_lossless_bitstream_specification
#include <AK/BitStream.h>
#include <AK/Debug.h>
#include <AK/Endian.h>
#include <AK/MemoryStream.h>
#include <LibCompress/DeflateTables.h>
#include <LibCompress/Huffman.h>
#include <LibGfx/Bitmap.h>
#include <LibGfx/ImageFormats/WebPSharedLossless.h>
#include <LibGfx/ImageFormats/WebPWriterLossless.h>
namespace Gfx {
NEVER_INLINE static ErrorOr<void> write_image_data(LittleEndianOutputBitStream& bit_stream, Bitmap const& bitmap, PrefixCodeGroup const& prefix_code_group)
{
// This is currently the hot loop. Keep performance in mind when you change it.
for (ARGB32 pixel : bitmap) {
u8 a = pixel >> 24;
u8 r = pixel >> 16;
u8 g = pixel >> 8;
u8 b = pixel;
TRY(prefix_code_group[0].write_symbol(bit_stream, g));
TRY(prefix_code_group[1].write_symbol(bit_stream, r));
TRY(prefix_code_group[2].write_symbol(bit_stream, b));
TRY(prefix_code_group[3].write_symbol(bit_stream, a));
}
return {};
}
struct CodeLengthSymbol {
u8 symbol { 0 };
u8 count { 0 }; // used for special symbols 16-18
};
// This is very similar to DeflateCompressor::encode_huffman_lengths().
// But:
// * size can be larger than 288 for green, is always 256 for r, b, a, and is always 40 for distance codes
// * code 16 has different semantics, requires last_non_zero_symbol
static size_t encode_huffman_lengths(ReadonlyBytes lengths, Array<CodeLengthSymbol, 256>& encoded_lengths)
{
size_t encoded_count = 0;
size_t i = 0;
u8 last_non_zero_symbol = 8; // "If code 16 is used before a non-zero value has been emitted, a value of 8 is repeated."
while (i < lengths.size()) {
if (lengths[i] == 0) {
auto zero_count = 0;
for (size_t j = i; j < min(lengths.size(), i + 138) && lengths[j] == 0; j++)
zero_count++;
if (zero_count < 3) { // below minimum repeated zero count
encoded_lengths[encoded_count++].symbol = 0;
i++;
continue;
}
if (zero_count <= 10) {
// "Code 17 emits a streak of zeros [3..10], i.e., 3 + ReadBits(3) times."
encoded_lengths[encoded_count].symbol = 17;
encoded_lengths[encoded_count++].count = zero_count;
} else {
// "Code 18 emits a streak of zeros of length [11..138], i.e., 11 + ReadBits(7) times."
encoded_lengths[encoded_count].symbol = 18;
encoded_lengths[encoded_count++].count = zero_count;
}
i += zero_count;
continue;
}
VERIFY(lengths[i] != 0);
last_non_zero_symbol = lengths[i];
encoded_lengths[encoded_count++].symbol = lengths[i++];
// "Code 16 repeats the previous non-zero value [3..6] times, i.e., 3 + ReadBits(2) times."
// This is different from deflate.
auto copy_count = 0;
for (size_t j = i; j < min(lengths.size(), i + 6) && lengths[j] == last_non_zero_symbol; j++)
copy_count++;
if (copy_count >= 3) {
encoded_lengths[encoded_count].symbol = 16;
encoded_lengths[encoded_count++].count = copy_count;
i += copy_count;
continue;
}
}
return encoded_count;
}
static ErrorOr<CanonicalCode> write_simple_code_lengths(LittleEndianOutputBitStream& bit_stream, ReadonlyBytes symbols)
{
VERIFY(symbols.size() <= 2);
static constexpr Array<u8, 1> empty { 0 };
if (symbols.size() == 0) {
// "Another special case is when all prefix code lengths are zeros (an empty prefix code). [...]
// empty prefix codes can be coded as those containing a single symbol 0."
symbols = empty;
}
unsigned non_zero_symbol_count = symbols.size();
TRY(bit_stream.write_bits(1u, 1u)); // Simple code length code.
TRY(bit_stream.write_bits(non_zero_symbol_count - 1, 1u)); // num_symbols - 1
if (symbols[0] <= 1) {
TRY(bit_stream.write_bits(0u, 1u)); // is_first_8bits: no
TRY(bit_stream.write_bits(symbols[0], 1u)); // symbol0
} else {
TRY(bit_stream.write_bits(1u, 1u)); // is_first_8bits: yes
TRY(bit_stream.write_bits(symbols[0], 8u)); // symbol0
}
if (non_zero_symbol_count > 1)
TRY(bit_stream.write_bits(symbols[1], 8u)); // symbol1
Array<u8, 256> bits_per_symbol {};
// "When coding a single leaf node [...], all but one code length are zeros, and the single leaf node value
// is marked with the length of 1 -- even when no bits are consumed when that single leaf node tree is used."
// CanonicalCode follows that convention too, even when describing simple code lengths.
bits_per_symbol[symbols[0]] = 1;
if (non_zero_symbol_count > 1)
bits_per_symbol[symbols[1]] = 1;
return MUST(CanonicalCode::from_bytes(bits_per_symbol));
}
static ErrorOr<CanonicalCode> write_normal_code_lengths(LittleEndianOutputBitStream& bit_stream, Array<u8, 256> const& bit_lengths, size_t alphabet_size)
{
// bit_lengths stores how many bits each symbol is encoded with.
// Drop trailing zero lengths.
// This will keep at least three symbols; else we would've called write_simple_code_lengths() instead.
// This is similar to the loops in Deflate::encode_block_lengths().
size_t code_count = bit_lengths.size();
while (bit_lengths[code_count - 1] == 0) {
code_count--;
VERIFY(code_count > 2);
}
Array<CodeLengthSymbol, 256> encoded_lengths {};
auto encoded_lengths_count = encode_huffman_lengths(bit_lengths.span().trim(code_count), encoded_lengths);
// The code to compute code length code lengths is very similar to some of the code in DeflateCompressor::flush().
// count code length frequencies
Array<u16, 19> code_lengths_frequencies { 0 };
for (size_t i = 0; i < encoded_lengths_count; i++) {
VERIFY(code_lengths_frequencies[encoded_lengths[i].symbol] < UINT16_MAX);
code_lengths_frequencies[encoded_lengths[i].symbol]++;
}
// generate optimal huffman code lengths code lengths
Array<u8, 19> code_lengths_bit_lengths {};
Compress::generate_huffman_lengths(code_lengths_bit_lengths, code_lengths_frequencies, 7); // deflate code length huffman can use up to 7 bits per symbol
// calculate actual code length code lengths count (without trailing zeros)
auto code_lengths_count = code_lengths_bit_lengths.size();
while (code_lengths_bit_lengths[kCodeLengthCodeOrder[code_lengths_count - 1]] == 0)
code_lengths_count--;
TRY(bit_stream.write_bits(0u, 1u)); // Normal code length code.
// This here isn't needed in Deflate because it always writes EndOfBlock. WebP does not have an EndOfBlock marker, so it needs this check.
if (code_lengths_count < 4)
code_lengths_count = 4;
dbgln_if(WEBP_DEBUG, "writing code_lengths_count: {}", code_lengths_count);
// WebP uses a different kCodeLengthCodeOrder than deflate. Other than that, the following is similar to a loop in Compress::write_dynamic_huffman().
// "int num_code_lengths = 4 + ReadBits(4);"
TRY(bit_stream.write_bits(code_lengths_count - 4u, 4u));
for (size_t i = 0; i < code_lengths_count; i++) {
TRY(bit_stream.write_bits(code_lengths_bit_lengths[kCodeLengthCodeOrder[i]], 3));
}
// Write code lengths. This is slightly different from deflate too -- deflate writes literal and distance lengths here,
// while WebP writes one of these codes each for g, r, b, a, and distance.
if (alphabet_size == encoded_lengths_count) {
TRY(bit_stream.write_bits(0u, 1u)); // max_symbol is alphabet_size
} else {
TRY(bit_stream.write_bits(1u, 1u)); // max_symbol is explicitly coded
// "int length_nbits = 2 + 2 * ReadBits(3);
// int max_symbol = 2 + ReadBits(length_nbits);"
// => length_nbits is at most 2 + 2*7 == 16
unsigned needed_length_nbits = ceil(log2(encoded_lengths_count - 2));
VERIFY(needed_length_nbits <= 16);
needed_length_nbits = ceil_div(needed_length_nbits, 2) * 2;
TRY(bit_stream.write_bits((needed_length_nbits - 2) / 2, 3u)); // length_nbits = 2 + 2 * 3 == 8
TRY(bit_stream.write_bits(encoded_lengths_count - 2, needed_length_nbits)); // max_symbol = 2 + 254
}
// The rest is identical to write_dynamic_huffman() again. (Code 16 has different semantics, but that doesn't matter here.)
auto code_lengths_code = MUST(CanonicalCode::from_bytes(code_lengths_bit_lengths));
for (size_t i = 0; i < encoded_lengths_count; i++) {
auto encoded_length = encoded_lengths[i];
TRY(code_lengths_code.write_symbol(bit_stream, encoded_length.symbol));
if (encoded_length.symbol == 16) {
// "Code 16 repeats the previous non-zero value [3..6] times, i.e., 3 + ReadBits(2) times."
TRY(bit_stream.write_bits<u8>(encoded_length.count - 3, 2));
} else if (encoded_length.symbol == 17) {
// "Code 17 emits a streak of zeros [3..10], i.e., 3 + ReadBits(3) times."
TRY(bit_stream.write_bits<u8>(encoded_length.count - 3, 3));
} else if (encoded_length.symbol == 18) {
// "Code 18 emits a streak of zeros of length [11..138], i.e., 11 + ReadBits(7) times."
TRY(bit_stream.write_bits<u8>(encoded_length.count - 11, 7));
}
}
return CanonicalCode::from_bytes(bit_lengths.span().trim(code_count));
}
static ErrorOr<void> write_VP8L_image_data(Stream& stream, Bitmap const& bitmap)
{
LittleEndianOutputBitStream bit_stream { MaybeOwned<Stream>(stream) };
// optional-transform = (%b1 transform optional-transform) / %b0
TRY(bit_stream.write_bits(0u, 1u)); // No transform for now.
// https://developers.google.com/speed/webp/docs/webp_lossless_bitstream_specification#5_image_data
// spatially-coded-image = color-cache-info meta-prefix data
// color-cache-info = %b0
// color-cache-info =/ (%b1 4BIT) ; 1 followed by color cache size
TRY(bit_stream.write_bits(0u, 1u)); // No color cache for now.
// meta-prefix = %b0 / (%b1 entropy-image)
TRY(bit_stream.write_bits(0u, 1u)); // No meta prefix for now.
// data = prefix-codes lz77-coded-image
// prefix-codes = prefix-code-group *prefix-codes
// prefix-code-group =
// 5prefix-code ; See "Interpretation of Meta Prefix Codes" to
// ; understand what each of these five prefix
// ; codes are for.
// We're writing a single prefix-code-group.
// "These codes are (in bitstream order):
// Prefix code #1: Used for green channel, backward-reference length, and color cache.
// Prefix code #2, #3, and #4: Used for red, blue, and alpha channels, respectively.
// Prefix code #5: Used for backward-reference distance."
// We use neither back-references not color cache entries yet.
// We can make this smarter later on.
size_t const color_cache_size = 0;
constexpr Array alphabet_sizes = to_array<size_t>({ 256 + 24 + static_cast<size_t>(color_cache_size), 256, 256, 256, 40 });
// If you add support for color cache: At the moment, CanonicalCodes does not support writing more than 288 symbols.
if (alphabet_sizes[0] > 288)
return Error::from_string_literal("Invalid alphabet size");
// We do use huffman coding by writing a single prefix-code-group for the entire image.
// FIXME: Consider using a meta-prefix image and using one prefix-code-group per tile.
Array<Array<u16, 256>, 4> symbol_frequencies {};
for (ARGB32 pixel : bitmap) {
static constexpr auto saturating_increment = [](u16& value) {
if (value < UINT16_MAX)
value++;
};
saturating_increment(symbol_frequencies[0][(pixel >> 8) & 0xff]); // green
saturating_increment(symbol_frequencies[1][(pixel >> 16) & 0xff]); // red
saturating_increment(symbol_frequencies[2][pixel & 0xff]); // blue
saturating_increment(symbol_frequencies[3][pixel >> 24]); // alpha
}
Array<Array<u8, 256>, 4> code_lengths {};
for (int i = 0; i < 4; ++i) {
// "Code [0..15] indicates literal code lengths." => the maximum bit length is 15.
Compress::generate_huffman_lengths(code_lengths[i], symbol_frequencies[i], 15);
}
PrefixCodeGroup prefix_code_group;
for (int i = 0; i < 4; ++i) {
u8 symbols[2];
unsigned non_zero_symbol_count = 0;
for (int j = 0; j < 256; ++j) {
if (code_lengths[i][j] != 0) {
if (non_zero_symbol_count < 2)
symbols[non_zero_symbol_count] = j;
non_zero_symbol_count++;
}
}
if (non_zero_symbol_count <= 2)
prefix_code_group[i] = TRY(write_simple_code_lengths(bit_stream, { symbols, non_zero_symbol_count }));
else
prefix_code_group[i] = TRY(write_normal_code_lengths(bit_stream, code_lengths[i], alphabet_sizes[i]));
}
// For code #5, use a simple empty code, since we don't use this yet.
prefix_code_group[4] = TRY(write_simple_code_lengths(bit_stream, {}));
// Image data.
TRY(write_image_data(bit_stream, bitmap, prefix_code_group));
// FIXME: Make ~LittleEndianOutputBitStream do this, or make it VERIFY() that it has happened at least.
TRY(bit_stream.align_to_byte_boundary());
TRY(bit_stream.flush_buffer_to_stream());
return {};
}
ErrorOr<ByteBuffer> compress_VP8L_image_data(Bitmap const& bitmap)
{
AllocatingMemoryStream vp8l_data_stream;
TRY(write_VP8L_image_data(vp8l_data_stream, bitmap));
return vp8l_data_stream.read_until_eof();
}
}