diff --git a/AK/Debug.h.in b/AK/Debug.h.in index dba7d0bb6e2..29b106f34c1 100644 --- a/AK/Debug.h.in +++ b/AK/Debug.h.in @@ -150,10 +150,6 @@ # cmakedefine01 LINE_EDITOR_DEBUG #endif -#ifndef LZMA_DEBUG -# cmakedefine01 LZMA_DEBUG -#endif - #ifndef LZW_DEBUG # cmakedefine01 LZW_DEBUG #endif diff --git a/Libraries/LibCompress/CMakeLists.txt b/Libraries/LibCompress/CMakeLists.txt index 65ad66b9859..349581b71d0 100644 --- a/Libraries/LibCompress/CMakeLists.txt +++ b/Libraries/LibCompress/CMakeLists.txt @@ -1,6 +1,5 @@ set(SOURCES Deflate.cpp - Lzma.cpp PackBitsDecoder.cpp Zlib.cpp Gzip.cpp diff --git a/Libraries/LibCompress/Lzma.cpp b/Libraries/LibCompress/Lzma.cpp deleted file mode 100644 index 568dc39241a..00000000000 --- a/Libraries/LibCompress/Lzma.cpp +++ /dev/null @@ -1,1328 +0,0 @@ -/* - * Copyright (c) 2023, Tim Schumacher - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include -#include - -namespace Compress { - -u32 LzmaHeader::dictionary_size() const -{ - // "If the value of dictionary size in properties is smaller than (1 << 12), - // the LZMA decoder must set the dictionary size variable to (1 << 12)." - constexpr u32 minimum_dictionary_size = (1 << 12); - if (unchecked_dictionary_size < minimum_dictionary_size) - return minimum_dictionary_size; - - return unchecked_dictionary_size; -} - -Optional LzmaHeader::uncompressed_size() const -{ - // We are making a copy of the packed field here because we would otherwise - // pass an unaligned reference to the constructor of Optional, which is - // undefined behavior. - auto uncompressed_size = encoded_uncompressed_size; - - // "If "Uncompressed size" field contains ones in all 64 bits, it means that - // uncompressed size is unknown and there is the "end marker" in stream, - // that indicates the end of decoding point." - if (uncompressed_size == placeholder_for_unknown_uncompressed_size) - return {}; - - // "In opposite case, if the value from "Uncompressed size" field is not - // equal to ((2^64) - 1), the LZMA stream decoding must be finished after - // specified number of bytes (Uncompressed size) is decoded. And if there - // is the "end marker", the LZMA decoder must read that marker also." - return uncompressed_size; -} - -ErrorOr LzmaHeader::decode_model_properties(u8 input_bits) -{ - // "Decodes the following values from the encoded model properties field: - // - // name Range Description - // lc [0, 8] the number of "literal context" bits - // lp [0, 4] the number of "literal pos" bits - // pb [0, 4] the number of "pos" bits - // - // Encoded using `((pb * 5 + lp) * 9 + lc)`." - - if (input_bits >= (9 * 5 * 5)) - return Error::from_string_literal("Encoded model properties value is larger than the highest possible value"); - - u8 literal_context_bits = input_bits % 9; - input_bits /= 9; - VERIFY(literal_context_bits >= 0 && literal_context_bits <= 8); - - u8 literal_position_bits = input_bits % 5; - input_bits /= 5; - VERIFY(literal_position_bits >= 0 && literal_position_bits <= 4); - - u8 position_bits = input_bits; - VERIFY(position_bits >= 0 && position_bits <= 4); - - return LzmaModelProperties { - .literal_context_bits = literal_context_bits, - .literal_position_bits = literal_position_bits, - .position_bits = position_bits, - }; -} - -ErrorOr LzmaHeader::encode_model_properties(LzmaModelProperties const& model_properties) -{ - if (model_properties.literal_context_bits > 8) - return Error::from_string_literal("LZMA literal context bits are too large to encode"); - - if (model_properties.literal_position_bits > 4) - return Error::from_string_literal("LZMA literal position bits are too large to encode"); - - if (model_properties.position_bits > 4) - return Error::from_string_literal("LZMA position bits are too large to encode"); - - return (model_properties.position_bits * 5 + model_properties.literal_position_bits) * 9 + model_properties.literal_context_bits; -} - -ErrorOr LzmaHeader::as_decompressor_options() const -{ - auto model_properties = TRY(decode_model_properties(encoded_model_properties)); - - return Compress::LzmaDecompressorOptions { - .literal_context_bits = model_properties.literal_context_bits, - .literal_position_bits = model_properties.literal_position_bits, - .position_bits = model_properties.position_bits, - .dictionary_size = dictionary_size(), - .uncompressed_size = uncompressed_size(), - .reject_end_of_stream_marker = false, - }; -} - -ErrorOr LzmaHeader::from_compressor_options(LzmaCompressorOptions const& options) -{ - auto encoded_model_properties = TRY(encode_model_properties({ - .literal_context_bits = options.literal_context_bits, - .literal_position_bits = options.literal_position_bits, - .position_bits = options.position_bits, - })); - - return LzmaHeader { - .encoded_model_properties = encoded_model_properties, - .unchecked_dictionary_size = options.dictionary_size, - .encoded_uncompressed_size = options.uncompressed_size.value_or(placeholder_for_unknown_uncompressed_size), - }; -} - -void LzmaState::initialize_to_default_probability(Span span) -{ - for (auto& entry : span) - entry = default_probability; -} - -ErrorOr> LzmaDecompressor::create_from_container(MaybeOwned stream, Optional> dictionary) -{ - auto header = TRY(stream->read_value()); - - return TRY(LzmaDecompressor::create_from_raw_stream(move(stream), TRY(header.as_decompressor_options()), move(dictionary))); -} - -ErrorOr> LzmaDecompressor::create_from_raw_stream(MaybeOwned stream, LzmaDecompressorOptions const& options, Optional> dictionary) -{ - if (!dictionary.has_value()) { - auto new_dictionary = TRY(CircularBuffer::create_empty(options.dictionary_size)); - dictionary = TRY(try_make(move(new_dictionary))); - } - - VERIFY((*dictionary)->capacity() >= options.dictionary_size); - - // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values." - auto literal_probabilities = TRY(FixedArray::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits)))); - - auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaDecompressor(move(stream), options, dictionary.release_value(), move(literal_probabilities)))); - - TRY(decompressor->initialize_range_decoder()); - - return decompressor; -} - -LzmaState::LzmaState(FixedArray literal_probabilities) - : m_literal_probabilities(move(literal_probabilities)) -{ - initialize_to_default_probability(m_literal_probabilities.span()); - - for (auto& array : m_length_to_position_states) - initialize_to_default_probability(array); - - for (auto& array : m_binary_tree_distance_probabilities) - initialize_to_default_probability(array); - - initialize_to_default_probability(m_alignment_bit_probabilities); - - initialize_to_default_probability(m_is_match_probabilities); - initialize_to_default_probability(m_is_rep_probabilities); - initialize_to_default_probability(m_is_rep_g0_probabilities); - initialize_to_default_probability(m_is_rep_g1_probabilities); - initialize_to_default_probability(m_is_rep_g2_probabilities); - initialize_to_default_probability(m_is_rep0_long_probabilities); -} - -LzmaDecompressor::LzmaDecompressor(MaybeOwned stream, LzmaDecompressorOptions options, MaybeOwned dictionary, FixedArray literal_probabilities) - : LzmaState(move(literal_probabilities)) - , m_stream(move(stream)) - , m_options(move(options)) - , m_dictionary(move(dictionary)) -{ -} - -bool LzmaDecompressor::is_range_decoder_in_clean_state() const -{ - return m_range_decoder_code == 0; -} - -bool LzmaDecompressor::has_reached_expected_data_size() const -{ - if (!m_options.uncompressed_size.has_value()) - return false; - - return m_total_processed_bytes >= m_options.uncompressed_size.value(); -} - -ErrorOr LzmaDecompressor::initialize_range_decoder() -{ - // "The LZMA Encoder always writes ZERO in initial byte of compressed stream. - // That scheme allows to simplify the code of the Range Encoder in the - // LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must - // stop decoding and report error." - { - auto byte = TRY(m_stream->read_value()); - if (byte != 0) - return Error::from_string_literal("Initial byte of data stream is not zero"); - } - - // Read the initial bytes into the range decoder. - m_range_decoder_code = 0; - for (size_t i = 0; i < 4; i++) { - auto byte = TRY(m_stream->read_value()); - m_range_decoder_code = m_range_decoder_code << 8 | byte; - } - - m_range_decoder_range = 0xFFFFFFFF; - - return {}; -} - -ErrorOr LzmaDecompressor::append_input_stream(MaybeOwned stream, Optional uncompressed_size) -{ - m_stream = move(stream); - - TRY(initialize_range_decoder()); - - if (m_options.uncompressed_size.has_value() != uncompressed_size.has_value()) - return Error::from_string_literal("Appending LZMA streams with mismatching uncompressed size status"); - - if (uncompressed_size.has_value()) - *m_options.uncompressed_size += *uncompressed_size; - - return {}; -} - -ErrorOr LzmaDecompressor::normalize_range_decoder() -{ - // "The Normalize() function keeps the "Range" value in described range." - - if (m_range_decoder_range >= minimum_range_value) - return {}; - - m_range_decoder_range <<= 8; - m_range_decoder_code <<= 8; - - m_range_decoder_code |= TRY(m_stream->read_value()); - - VERIFY(m_range_decoder_range >= minimum_range_value); - - return {}; -} - -ErrorOr LzmaCompressor::shift_range_encoder() -{ - if ((m_range_encoder_code >> 32) == 0x01) { - // If there is an overflow, we can finalize the chain we were previously building. - // This includes incrementing both the cached byte and all the 0xFF bytes that we generate. - VERIFY(m_range_encoder_cached_byte != 0xFF); - TRY(m_stream->write_value(m_range_encoder_cached_byte + 1)); - for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++) - TRY(m_stream->write_value(0x00)); - m_range_encoder_ff_chain_length = 0; - m_range_encoder_cached_byte = (m_range_encoder_code >> 24); - } else if ((m_range_encoder_code >> 24) == 0xFF) { - // If the byte to flush is 0xFF, it can potentially propagate an overflow and needs to be added to the chain. - m_range_encoder_ff_chain_length++; - } else { - // If the byte to flush isn't 0xFF, any future overflows will not be propagated beyond this point, - // so we can be sure that the built chain doesn't change anymore. - TRY(m_stream->write_value(m_range_encoder_cached_byte)); - for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++) - TRY(m_stream->write_value(0xFF)); - m_range_encoder_ff_chain_length = 0; - m_range_encoder_cached_byte = (m_range_encoder_code >> 24); - } - - // In all three cases we now recorded the highest byte in some way, so we can shift it away and shift in a null byte as the lowest byte. - m_range_encoder_range <<= 8; - m_range_encoder_code <<= 8; - - // Since we are working with a 64-bit code, we need to limit it to 32 bits artificially. - m_range_encoder_code &= 0xFFFFFFFF; - - return {}; -} - -ErrorOr LzmaCompressor::normalize_range_encoder() -{ - u64 const maximum_range_value = m_range_encoder_code + m_range_encoder_range; - - // Logically, we should only ever build up an overflow that is smaller than or equal to 0x01. - VERIFY((maximum_range_value >> 32) <= 0x01); - - if (m_range_encoder_range >= minimum_range_value) - return {}; - - TRY(shift_range_encoder()); - - VERIFY(m_range_encoder_range >= minimum_range_value); - - return {}; -} - -ErrorOr LzmaDecompressor::decode_direct_bit() -{ - dbgln_if(LZMA_DEBUG, "Decoding direct bit {} with code = {:#x}, range = {:#x}", 1 - ((m_range_decoder_code - (m_range_decoder_range >> 1)) >> 31), m_range_decoder_code, m_range_decoder_range); - - m_range_decoder_range >>= 1; - m_range_decoder_code -= m_range_decoder_range; - - u32 temp = 0 - (m_range_decoder_code >> 31); - - m_range_decoder_code += m_range_decoder_range & temp; - - if (m_range_decoder_code == m_range_decoder_range) - return Error::from_string_literal("Reached an invalid state while decoding LZMA stream"); - - TRY(normalize_range_decoder()); - - return temp + 1; -} - -ErrorOr LzmaCompressor::encode_direct_bit(u8 value) -{ - dbgln_if(LZMA_DEBUG, "Encoding direct bit {} with code = {:#x}, range = {:#x}", value, m_range_encoder_code, m_range_encoder_range); - - m_range_encoder_range >>= 1; - - if (value != 0) - m_range_encoder_code += m_range_encoder_range; - - TRY(normalize_range_encoder()); - - return {}; -} - -ErrorOr LzmaDecompressor::decode_bit_with_probability(Probability& probability) -{ - // "The LZMA decoder provides the pointer to CProb variable that contains - // information about estimated probability for symbol 0 and the Range Decoder - // updates that CProb variable after decoding." - - u32 bound = (m_range_decoder_range >> probability_bit_count) * probability; - - dbgln_if(LZMA_DEBUG, "Decoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", m_range_decoder_code < bound ? 0 : 1, probability, bound, m_range_decoder_code, m_range_decoder_range); - - if (m_range_decoder_code < bound) { - probability += ((1 << probability_bit_count) - probability) >> probability_shift_width; - m_range_decoder_range = bound; - TRY(normalize_range_decoder()); - return 0; - } else { - probability -= probability >> probability_shift_width; - m_range_decoder_code -= bound; - m_range_decoder_range -= bound; - TRY(normalize_range_decoder()); - return 1; - } -} - -ErrorOr LzmaCompressor::encode_bit_with_probability(Probability& probability, u8 value) -{ - u32 bound = (m_range_encoder_range >> probability_bit_count) * probability; - - dbgln_if(LZMA_DEBUG, "Encoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", value, probability, bound, m_range_encoder_code, m_range_encoder_range); - - if (value == 0) { - probability += ((1 << probability_bit_count) - probability) >> probability_shift_width; - m_range_encoder_range = bound; - } else { - probability -= probability >> probability_shift_width; - m_range_encoder_code += bound; - m_range_encoder_range -= bound; - } - - TRY(normalize_range_encoder()); - return {}; -} - -ErrorOr LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span probability_tree) -{ - VERIFY(bit_count <= sizeof(u16) * 8); - VERIFY(probability_tree.size() >= 1ul << bit_count); - - // This has been modified from the reference implementation to unlink the result and the tree index, - // which should allow for better readability. - - u16 result = 0; - size_t tree_index = 1; - - for (size_t i = 0; i < bit_count; i++) { - u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index])); - result = (result << 1) | next_bit; - tree_index = (tree_index << 1) | next_bit; - } - - dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using bit tree", result, bit_count); - - return result; -} - -ErrorOr LzmaCompressor::encode_symbol_using_bit_tree(size_t bit_count, Span probability_tree, u16 value) -{ - VERIFY(bit_count <= sizeof(u16) * 8); - VERIFY(probability_tree.size() >= 1ul << bit_count); - VERIFY(value <= (1 << bit_count) - 1); - - auto original_value = value; - - // Shift value to make the first sent byte the most significant bit. This makes the shifting logic a lot easier to read. - value <<= sizeof(u16) * 8 - bit_count; - - size_t tree_index = 1; - - for (size_t i = 0; i < bit_count; i++) { - u8 const next_bit = (value & 0x8000) >> (sizeof(u16) * 8 - 1); - value <<= 1; - TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit)); - tree_index = (tree_index << 1) | next_bit; - } - - dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using bit tree", original_value, bit_count); - - return {}; -} - -ErrorOr LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span probability_tree) -{ - VERIFY(bit_count <= sizeof(u16) * 8); - VERIFY(probability_tree.size() >= 1ul << bit_count); - - u16 result = 0; - size_t tree_index = 1; - - for (size_t i = 0; i < bit_count; i++) { - u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index])); - result |= next_bit << i; - tree_index = (tree_index << 1) | next_bit; - } - - dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using reverse bit tree", result, bit_count); - - return result; -} - -ErrorOr LzmaCompressor::encode_symbol_using_reverse_bit_tree(size_t bit_count, Span probability_tree, u16 value) -{ - VERIFY(bit_count <= sizeof(u16) * 8); - VERIFY(probability_tree.size() >= 1ul << bit_count); - VERIFY(value <= (1 << bit_count) - 1); - - auto original_value = value; - - size_t tree_index = 1; - - for (size_t i = 0; i < bit_count; i++) { - u8 const next_bit = value & 1; - value >>= 1; - TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit)); - tree_index = (tree_index << 1) | next_bit; - } - - dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using reverse bit tree", original_value, bit_count); - - return {}; -} - -ErrorOr LzmaDecompressor::decode_literal_to_output_buffer() -{ - u8 previous_byte = 0; - if (m_dictionary->seekback_limit() > 0) { - auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1)); - VERIFY(read_bytes.size() == sizeof(previous_byte)); - } - - // "To select the table for decoding it uses the context that consists of - // (lc) high bits from previous literal and (lp) low bits from value that - // represents current position in outputStream." - u16 literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1); - u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits); - u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output; - - Span selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size); - - // The result is defined as u16 here and initialized to 1, but we will cut off the top bits before queueing them into the output buffer. - // The top bit is only used to track how much we have decoded already, and to select the correct probability table. - u16 result = 1; - - // "If (State > 7), the Literal Decoder also uses "matchByte" that represents - // the byte in OutputStream at position the is the DISTANCE bytes before - // current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair - // of latest decoded match." - // Note: The specification says `(State > 7)`, but the reference implementation does `(State >= 7)`, which is a mismatch. - // Testing `(State > 7)` with actual test files yields errors, so the reference implementation appears to be the correct one. - if (m_state >= 7) { - u8 matched_byte = 0; - auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset())); - VERIFY(read_bytes.size() == sizeof(matched_byte)); - - dbgln_if(LZMA_DEBUG, "Decoding literal using match byte {:#x}", matched_byte); - - do { - u8 match_bit = (matched_byte >> 7) & 1; - matched_byte <<= 1; - - u8 decoded_bit = TRY(decode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result])); - result = result << 1 | decoded_bit; - - if (match_bit != decoded_bit) - break; - } while (result < 0x100); - } - - while (result < 0x100) - result = (result << 1) | TRY(decode_bit_with_probability(selected_probability_table[result])); - - u8 actual_result = result - 0x100; - - size_t written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) }); - VERIFY(written_bytes == sizeof(actual_result)); - m_total_processed_bytes += sizeof(actual_result); - - dbgln_if(LZMA_DEBUG, "Decoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", actual_result, m_state, literal_state, previous_byte); - - return {}; -} - -ErrorOr LzmaCompressor::encode_literal(u8 literal) -{ - // This function largely mirrors `decode_literal_to_output_buffer`, so specification comments have been omitted. - - TRY(encode_match_type(MatchType::Literal)); - - // Note: We have already read the next byte from the input buffer, so it's now in the seekback buffer, shifting all seekback offsets by one. - u8 previous_byte = 0; - if (m_dictionary->seekback_limit() - m_dictionary->used_space() > 1) { - auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 2 + m_dictionary->used_space())); - VERIFY(read_bytes.size() == sizeof(previous_byte)); - } - u16 const literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1); - u16 const literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits); - u16 const literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output; - - Span selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size); - - auto original_literal = literal; - u16 result = 1; - - if (m_state >= 7) { - u8 matched_byte = 0; - auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset() + m_dictionary->used_space() + 1)); - VERIFY(read_bytes.size() == sizeof(matched_byte)); - - dbgln_if(LZMA_DEBUG, "Encoding literal using match byte {:#x}", matched_byte); - - do { - u8 const match_bit = (matched_byte >> 7) & 1; - matched_byte <<= 1; - - u8 const encoded_bit = (literal & 0x80) >> 7; - literal <<= 1; - - TRY(encode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result], encoded_bit)); - result = result << 1 | encoded_bit; - - if (match_bit != encoded_bit) - break; - } while (result < 0x100); - } - - while (result < 0x100) { - u8 const encoded_bit = (literal & 0x80) >> 7; - literal <<= 1; - - TRY(encode_bit_with_probability(selected_probability_table[result], encoded_bit)); - - result = (result << 1) | encoded_bit; - } - - m_total_processed_bytes += sizeof(literal); - - dbgln_if(LZMA_DEBUG, "Encoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", original_literal, m_state, literal_state, previous_byte); - - update_state_after_literal(); - - return {}; -} - -ErrorOr LzmaCompressor::encode_existing_match(size_t real_distance, size_t real_length) -{ - VERIFY(real_distance >= normalized_to_real_match_distance_offset); - u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset; - - VERIFY(real_length >= normalized_to_real_match_length_offset); - u16 const normalized_length = real_length - normalized_to_real_match_length_offset; - - if (normalized_distance == m_rep0) { - TRY(encode_match_type(MatchType::RepMatch0)); - } else if (normalized_distance == m_rep1) { - TRY(encode_match_type(MatchType::RepMatch1)); - - u32 const distance = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } else if (normalized_distance == m_rep2) { - TRY(encode_match_type(MatchType::RepMatch2)); - - u32 const distance = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } else if (normalized_distance == m_rep3) { - TRY(encode_match_type(MatchType::RepMatch3)); - - u32 const distance = m_rep3; - m_rep3 = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } else { - VERIFY_NOT_REACHED(); - } - - TRY(encode_normalized_match_length(m_rep_length_coder, normalized_length)); - update_state_after_rep(); - MUST(m_dictionary->discard(real_length)); - m_total_processed_bytes += real_length; - - return {}; -} - -ErrorOr LzmaCompressor::encode_new_match(size_t real_distance, size_t real_length) -{ - VERIFY(real_distance >= normalized_to_real_match_distance_offset); - u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset; - - VERIFY(real_length >= normalized_to_real_match_length_offset); - u16 const normalized_length = real_length - normalized_to_real_match_length_offset; - - TRY(encode_normalized_simple_match(normalized_distance, normalized_length)); - - MUST(m_dictionary->discard(real_length)); - m_total_processed_bytes += real_length; - - return {}; -} - -ErrorOr LzmaCompressor::encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length) -{ - TRY(encode_match_type(MatchType::SimpleMatch)); - - m_rep3 = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - - TRY(encode_normalized_match_length(m_length_coder, normalized_length)); - - update_state_after_match(); - - TRY(encode_normalized_match_distance(normalized_length, normalized_distance)); - m_rep0 = normalized_distance; - - return {}; -} - -LzmaState::LzmaLengthCoderState::LzmaLengthCoderState() -{ - for (auto& array : m_low_length_probabilities) - initialize_to_default_probability(array); - - for (auto& array : m_medium_length_probabilities) - initialize_to_default_probability(array); - - initialize_to_default_probability(m_high_length_probabilities); -} - -ErrorOr LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderState& length_decoder_state) -{ - // "LZMA uses "posState" value as context to select the binary tree - // from LowCoder and MidCoder binary tree arrays:" - u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); - - // "The following scheme is used for the match length encoding: - // - // Binary encoding Binary Tree structure Zero-based match length - // sequence (binary + decimal): - // - // 0 xxx LowCoder[posState] xxx - if (TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability)) == 0) - return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span())); - - // 1 0 yyy MidCoder[posState] yyy + 8 - if (TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability)) == 0) - return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span())) + 8; - - // 1 1 zzzzzzzz HighCoder zzzzzzzz + 16" - return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16; -} - -ErrorOr LzmaCompressor::encode_normalized_match_length(LzmaLengthCoderState& length_coder_state, u16 normalized_length) -{ - u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); - - if (normalized_length < 8) { - TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 0)); - TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_low_length_probabilities[position_state].span(), normalized_length)); - return {}; - } - - TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 1)); - - if (normalized_length < 16) { - TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 0)); - TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_medium_length_probabilities[position_state].span(), normalized_length - 8)); - return {}; - } - - TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 1)); - TRY(encode_symbol_using_bit_tree(8, length_coder_state.m_high_length_probabilities.span(), normalized_length - 16)); - return {}; -} - -ErrorOr LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length) -{ - // "LZMA uses normalized match length (zero-based length) - // to calculate the context state "lenState" do decode the distance value." - u16 length_state = min(normalized_match_length, number_of_length_to_position_states - 1); - - // "At first stage the distance decoder decodes 6-bit "posSlot" value with bit - // tree decoder from PosSlotDecoder array." - u16 position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span())); - - // "The encoding scheme for distance value is shown in the following table: - // - // posSlot (decimal) / - // zero-based distance (binary) - // 0 0 - // 1 1 - // 2 10 - // 3 11 - // - // 4 10 x - // 5 11 x - // 6 10 xx - // 7 11 xx - // 8 10 xxx - // 9 11 xxx - // 10 10 xxxx - // 11 11 xxxx - // 12 10 xxxxx - // 13 11 xxxxx - // - // 14 10 yy zzzz - // 15 11 yy zzzz - // 16 10 yyy zzzz - // 17 11 yyy zzzz - // ... - // 62 10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz - // 63 11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz - // - // where - // "x ... x" means the sequence of binary symbols encoded with binary tree and - // "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13. - // "y" means direct bit encoded with range coder. - // "zzzz" means the sequence of four binary symbols encoded with binary - // tree with "Reverse" scheme, where one common binary tree "AlignDecoder" - // is used for all posSlot values." - - // "If (posSlot < 4), the "dist" value is equal to posSlot value." - if (position_slot < first_position_slot_with_binary_tree_bits) - return position_slot; - - // From here on, the first bit of the distance is always set and the second bit is set if the last bit of the position slot is set. - u32 distance_prefix = ((1 << 1) | ((position_slot & 1) << 0)); - - // "If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of - // the high bits of "dist" value and the number of the low bits. - // If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. - // (one separated bit tree decoder per one posSlot value) and "Reverse" scheme." - if (position_slot < first_position_slot_with_direct_encoded_bits) { - size_t number_of_bits_to_decode = (position_slot / 2) - 1; - auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits]; - return (distance_prefix << number_of_bits_to_decode) | TRY(decode_symbol_using_reverse_bit_tree(number_of_bits_to_decode, selected_probability_tree)); - } - - // " if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct - // bits from RangeDecoder and the low 4 bits are decoded with a bit tree - // decoder "AlignDecoder" with "Reverse" scheme." - size_t number_of_direct_bits_to_decode = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2; - for (size_t i = 0; i < number_of_direct_bits_to_decode; i++) { - distance_prefix = (distance_prefix << 1) | TRY(decode_direct_bit()); - } - return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities)); -} - -ErrorOr LzmaCompressor::encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance) -{ - u16 const length_state = min(normalized_match_length, number_of_length_to_position_states - 1); - - if (normalized_match_distance < first_position_slot_with_binary_tree_bits) { - // The normalized distance gets encoded as the position slot. - TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), normalized_match_distance)); - return {}; - } - - // Note: This has been deduced, there is no immediate relation to the decoding function. - u16 const distance_log2 = AK::log2(normalized_match_distance); - u16 number_of_distance_bits = count_required_bits(normalized_match_distance); - u16 const position_slot = (distance_log2 << 1) + ((normalized_match_distance >> (distance_log2 - 1)) & 1); - - TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), position_slot)); - - // Mask off the top two bits of the value, those are already encoded by the position slot. - normalized_match_distance &= (1 << (number_of_distance_bits - 2)) - 1; - number_of_distance_bits -= 2; - - if (position_slot < first_position_slot_with_direct_encoded_bits) { - // The value gets encoded using only a reverse bit tree coder. - auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits]; - TRY(encode_symbol_using_reverse_bit_tree(number_of_distance_bits, selected_probability_tree, normalized_match_distance)); - return {}; - } - - // The value is split into direct bits (everything except the last four bits) and alignment bits (last four bits). - auto direct_bits = normalized_match_distance & ~((1 << number_of_alignment_bits) - 1); - auto const alignment_bits = normalized_match_distance & ((1 << number_of_alignment_bits) - 1); - - // Shift to-be-written direct bits to the most significant position for easier access. - direct_bits <<= sizeof(direct_bits) * 8 - number_of_distance_bits; - - for (auto i = 0u; i < number_of_distance_bits - number_of_alignment_bits; i++) { - TRY(encode_direct_bit((direct_bits & 0x80000000) ? 1 : 0)); - direct_bits <<= 1; - } - - TRY(encode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities, alignment_bits)); - - return {}; -} - -u32 LzmaState::current_repetition_offset() const -{ - // LZMA never needs to read at offset 0 (i.e. the actual read head of the buffer). - // Instead, the values are remapped so that the rep-value n starts reading n + 1 bytes back. - // The special rep-value 0xFFFFFFFF is reserved for marking the end of the stream, - // so this should never overflow. - VERIFY(m_rep0 <= NumericLimits::max() - normalized_to_real_match_distance_offset); - return m_rep0 + normalized_to_real_match_distance_offset; -} - -void LzmaState::update_state_after_literal() -{ - if (m_state < 4) - m_state = 0; - else if (m_state < 10) - m_state -= 3; - else - m_state -= 6; -} - -void LzmaState::update_state_after_match() -{ - if (m_state < 7) - m_state = 7; - else - m_state = 10; -} - -void LzmaState::update_state_after_rep() -{ - if (m_state < 7) - m_state = 8; - else - m_state = 11; -} - -void LzmaState::update_state_after_short_rep() -{ - if (m_state < 7) - m_state = 9; - else - m_state = 11; -} - -ErrorOr LzmaDecompressor::decode_match_type() -{ - // "The decoder calculates "state2" variable value to select exact variable from - // "IsMatch" and "IsRep0Long" arrays." - u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); - u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; - - // "The decoder uses the following code flow scheme to select exact - // type of LITERAL or MATCH: - // - // IsMatch[state2] decode - // 0 - the Literal" - if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) { - dbgln_if(LZMA_DEBUG, "Decoded match type 'Literal'"); - return MatchType::Literal; - } - - // " 1 - the Match - // IsRep[state] decode - // 0 - Simple Match" - if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) { - dbgln_if(LZMA_DEBUG, "Decoded match type 'SimpleMatch'"); - return MatchType::SimpleMatch; - } - - // " 1 - Rep Match - // IsRepG0[state] decode - // 0 - the distance is rep0" - if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) { - // " IsRep0Long[state2] decode - // 0 - Short Rep Match" - if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) { - dbgln_if(LZMA_DEBUG, "Decoded match type 'ShortRepMatch'"); - return MatchType::ShortRepMatch; - } - - // " 1 - Rep Match 0" - dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch0'"); - return MatchType::RepMatch0; - } - - // " 1 - - // IsRepG1[state] decode - // 0 - Rep Match 1" - if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) { - dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch1'"); - return MatchType::RepMatch1; - } - - // " 1 - - // IsRepG2[state] decode - // 0 - Rep Match 2" - if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) { - dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch2'"); - return MatchType::RepMatch2; - } - - // " 1 - Rep Match 3" - dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch3'"); - return MatchType::RepMatch3; -} - -ErrorOr LzmaCompressor::encode_match_type(MatchType match_type) -{ - u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); - u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; - - if (match_type == MatchType::Literal) { - TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 0)); - dbgln_if(LZMA_DEBUG, "Encoded match type 'Literal'"); - return {}; - } - TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 1)); - - if (match_type == MatchType::SimpleMatch) { - TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 0)); - dbgln_if(LZMA_DEBUG, "Encoded match type 'SimpleMatch'"); - return {}; - } - TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 1)); - - if (match_type == MatchType::ShortRepMatch || match_type == MatchType::RepMatch0) { - TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 0)); - TRY(encode_bit_with_probability(m_is_rep0_long_probabilities[state2], match_type == MatchType::RepMatch0)); - if constexpr (LZMA_DEBUG) { - if (match_type == RepMatch0) - dbgln("Encoded match type 'RepMatch0'"); - else - dbgln("Encoded match type 'ShortRepMatch'"); - } - return {}; - } - TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 1)); - - if (match_type == MatchType::RepMatch1) { - TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 0)); - dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch1'"); - return {}; - } - TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 1)); - - if (match_type == MatchType::RepMatch2) { - TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 0)); - dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch2'"); - return {}; - } - TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 1)); - dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch3'"); - return {}; -} - -ErrorOr LzmaCompressor::encode_once() -{ - // Check if any of our existing match distances are currently usable. - Vector const existing_distances { - m_rep0 + normalized_to_real_match_distance_offset, - m_rep1 + normalized_to_real_match_distance_offset, - m_rep2 + normalized_to_real_match_distance_offset, - m_rep3 + normalized_to_real_match_distance_offset, - }; - auto existing_distance_result = m_dictionary->find_copy_in_seekback(existing_distances, m_dictionary->used_space(), normalized_to_real_match_length_offset); - - if (existing_distance_result.has_value()) { - auto selected_match = existing_distance_result.release_value(); - TRY(encode_existing_match(selected_match.distance, selected_match.length)); - return {}; - } - - // If we weren't able to find any viable existing offsets, we now have to search the rest of the dictionary for possible new offsets. - auto new_distance_result = m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset); - - if (new_distance_result.has_value()) { - auto selected_match = new_distance_result.release_value(); - TRY(encode_new_match(selected_match.distance, selected_match.length)); - return {}; - } - - // If we weren't able to find any matches, we don't have any other choice than to encode the next byte as a literal. - u8 next_byte { 0 }; - TRY(m_dictionary->read({ &next_byte, sizeof(next_byte) })); - TRY(encode_literal(next_byte)); - return {}; -} - -ErrorOr LzmaDecompressor::read_some(Bytes bytes) -{ - while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) { - if (m_found_end_of_stream_marker) - break; - - if (has_reached_expected_data_size()) { - // If the decoder is in a clean state, we assume that this is fine. - if (is_range_decoder_in_clean_state()) - break; - - // Otherwise, we give it one last try to find the end marker in the remaining data. - } - - auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr { - VERIFY(!m_leftover_match_length.has_value()); - - if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_processed_bytes + real_length) - return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size"); - - auto copied_length = TRY(m_dictionary->copy_from_seekback(current_repetition_offset(), real_length)); - - m_total_processed_bytes += copied_length; - real_length -= copied_length; - - if (real_length > 0) - m_leftover_match_length = real_length; - - return {}; - }; - - // If we have a leftover part of a repeating match, we should finish that first. - if (m_leftover_match_length.has_value()) { - TRY(copy_match_to_buffer(m_leftover_match_length.release_value())); - continue; - } - - auto const match_type = TRY(decode_match_type()); - - // If we are looking for EOS, but find another match type, the stream is also corrupted. - if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch) - return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match"); - - if (match_type == MatchType::Literal) { - // "At first the LZMA decoder must check that it doesn't exceed - // specified uncompressed size." - // This is already checked for at the beginning of the loop. - - // "Then it decodes literal value and puts it to sliding window." - TRY(decode_literal_to_output_buffer()); - - // "Then the decoder must update the "state" value." - update_state_after_literal(); - continue; - } - - if (match_type == MatchType::SimpleMatch) { - // "The distance history table is updated with the following scheme:" - m_rep3 = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - - // "The zero-based length is decoded with "LenDecoder"." - u16 normalized_length = TRY(decode_normalized_match_length(m_length_coder)); - - // "The state is update with UpdateState_Match function." - update_state_after_match(); - - // "and the new "rep0" value is decoded with DecodeDistance." - m_rep0 = TRY(decode_normalized_match_distance(normalized_length)); - - // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have - // "End of stream" marker, so we can stop decoding and check finishing - // condition in Range Decoder" - if (m_rep0 == end_of_stream_marker) { - // If we should reject end-of-stream markers, do so now. - // Note that this is not part of LZMA, as LZMA allows end-of-stream markers in all contexts, so pure LZMA should never set this option. - if (m_options.reject_end_of_stream_marker) - return Error::from_string_literal("An end-of-stream marker was found, but the LZMA stream is configured to reject them"); - - // The range decoder condition is checked after breaking out of the loop. - m_found_end_of_stream_marker = true; - continue; - } - - // If we are looking for EOS, but haven't found it here, the stream is corrupted. - if (has_reached_expected_data_size()) - return Error::from_string_literal("First simple match after the expected uncompressed size is not the EOS marker"); - - // "If uncompressed size is defined, LZMA decoder must check that it doesn't - // exceed that specified uncompressed size." - // This is being checked for in the common "copy to buffer" implementation. - - // "Also the decoder must check that "rep0" value is not larger than dictionary size - // and is not larger than the number of already decoded bytes." - if (current_repetition_offset() > m_dictionary->seekback_limit()) - return Error::from_string_literal("rep0 value is larger than the possible lookback size"); - - // "Then the decoder must copy match bytes as described in - // "The match symbols copying" section." - TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset)); - - continue; - } - - if (match_type == MatchType::ShortRepMatch) { - // "LZMA doesn't update the distance history." - - // "If the subtype is "Short Rep Match", the decoder updates the state, puts - // the one byte from window to current position in window and goes to next - // MATCH/LITERAL symbol." - update_state_after_short_rep(); - - TRY(copy_match_to_buffer(1)); - - continue; - } - - // Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not - // run the detection for other match types and to not switch around the distance history. - - if (match_type == MatchType::RepMatch1) { - u32 distance = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } - - if (match_type == MatchType::RepMatch2) { - u32 distance = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } - - if (match_type == MatchType::RepMatch3) { - u32 distance = m_rep3; - m_rep3 = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } - - // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based - // length of match with "RepLenDecoder" decoder." - u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_coder)); - - // "Then it updates the state." - update_state_after_rep(); - - // "Then the decoder must copy match bytes as described in - // "The Match symbols copying" section." - TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset)); - } - - if (m_found_end_of_stream_marker || has_reached_expected_data_size()) { - if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value()) - return Error::from_string_literal("Found end-of-stream marker earlier than expected"); - - if (!is_range_decoder_in_clean_state()) - return Error::from_string_literal("LZMA stream ends in an unclean state"); - } - - return m_dictionary->read(bytes); -} - -ErrorOr LzmaDecompressor::write_some(ReadonlyBytes) -{ - return Error::from_errno(EBADF); -} - -bool LzmaDecompressor::is_eof() const -{ - if (m_dictionary->used_space() > 0) - return false; - - if (has_reached_expected_data_size()) - return true; - - return m_found_end_of_stream_marker; -} - -bool LzmaDecompressor::is_open() const -{ - return true; -} - -void LzmaDecompressor::close() -{ -} - -ErrorOr> LzmaCompressor::create_container(MaybeOwned stream, LzmaCompressorOptions const& options) -{ - auto dictionary = TRY(try_make(TRY(SearchableCircularBuffer::create_empty(options.dictionary_size + largest_real_match_length)))); - - // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values." - auto literal_probabilities = TRY(FixedArray::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits)))); - - auto header = TRY(LzmaHeader::from_compressor_options(options)); - TRY(stream->write_value(header)); - - auto compressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaCompressor(move(stream), options, move(dictionary), move(literal_probabilities)))); - - return compressor; -} - -LzmaCompressor::LzmaCompressor(MaybeOwned stream, Compress::LzmaCompressorOptions options, MaybeOwned dictionary, FixedArray literal_probabilities) - : LzmaState(move(literal_probabilities)) - , m_stream(move(stream)) - , m_options(move(options)) - , m_dictionary(move(dictionary)) -{ -} - -ErrorOr LzmaCompressor::read_some(Bytes) -{ - return Error::from_errno(EBADF); -} - -ErrorOr LzmaCompressor::write_some(ReadonlyBytes bytes) -{ - // Fill the input buffer until it's full or until we can't read any more data. - size_t processed_bytes = min(bytes.size(), largest_real_match_length - m_dictionary->used_space()); - bytes = bytes.trim(processed_bytes); - - while (bytes.size() > 0) { - auto const written_bytes = m_dictionary->write(bytes); - bytes = bytes.slice(written_bytes); - } - - VERIFY(m_dictionary->used_space() <= largest_real_match_length); - - if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() > m_options.uncompressed_size.value()) - return Error::from_string_literal("Tried to compress more LZMA data than announced"); - - TRY(encode_once()); - - // If we read enough data to reach the final uncompressed size, flush automatically. - // Flushing will handle encoding the remaining data for us and finalize the stream. - if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() >= m_options.uncompressed_size.value()) - TRY(flush()); - - return processed_bytes; -} - -ErrorOr LzmaCompressor::flush() -{ - if (m_has_flushed_data) - return Error::from_string_literal("Flushed an LZMA stream twice"); - - while (m_dictionary->used_space() > 0) - TRY(encode_once()); - - if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value()) - return Error::from_string_literal("Flushing LZMA data with known but unreached uncompressed size"); - - // The LZMA specification technically also allows both a known size and an end-of-stream marker simultaneously, - // but LZMA2 rejects them, so skip emitting the end-of-stream marker if we know the uncompressed size. - if (!m_options.uncompressed_size.has_value()) - TRY(encode_normalized_simple_match(end_of_stream_marker, 0)); - - // Shifting the range encoder using the normal operation handles any pending overflows. - TRY(shift_range_encoder()); - - // Now, the remaining bytes are the cached byte, the chain of 0xFF, and the upper 3 bytes of the current `code`. - // Incrementing the values does not have to be considered as no overflows are pending. The fourth byte is the - // null byte that we just shifted in, which should not be flushed as it would be extraneous junk data. - TRY(m_stream->write_value(m_range_encoder_cached_byte)); - for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++) - TRY(m_stream->write_value(0xFF)); - TRY(m_stream->write_value(m_range_encoder_code >> 24)); - TRY(m_stream->write_value(m_range_encoder_code >> 16)); - TRY(m_stream->write_value(m_range_encoder_code >> 8)); - - m_has_flushed_data = true; - return {}; -} - -bool LzmaCompressor::is_eof() const -{ - return true; -} - -bool LzmaCompressor::is_open() const -{ - return !m_has_flushed_data; -} - -void LzmaCompressor::close() -{ - if (!m_has_flushed_data) { - // Note: We need a better API for specifying things like this. - flush().release_value_but_fixme_should_propagate_errors(); - } -} - -LzmaCompressor::~LzmaCompressor() -{ - if (!m_has_flushed_data) { - // Note: We need a better API for specifying things like this. - flush().release_value_but_fixme_should_propagate_errors(); - } -} - -} diff --git a/Libraries/LibCompress/Lzma.h b/Libraries/LibCompress/Lzma.h deleted file mode 100644 index 066c97612c9..00000000000 --- a/Libraries/LibCompress/Lzma.h +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) 2023, Tim Schumacher - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace Compress { - -// This implementation is mostly based on the LZMA specification contained in the 7-Zip SDK, which has been placed in the public domain. -// LZMA Specification Draft (2015): https://www.7-zip.org/a/lzma-specification.7z - -struct LzmaModelProperties { - u8 literal_context_bits; - u8 literal_position_bits; - u8 position_bits; -}; - -struct LzmaDecompressorOptions { - u8 literal_context_bits { 0 }; - u8 literal_position_bits { 0 }; - u8 position_bits { 0 }; - u32 dictionary_size { 0 }; - Optional uncompressed_size; - bool reject_end_of_stream_marker { false }; -}; - -struct LzmaCompressorOptions { - // Note: The default settings have been chosen based on the default settings of other LZMA compressors. - u8 literal_context_bits { 3 }; - u8 literal_position_bits { 0 }; - u8 position_bits { 2 }; - u32 dictionary_size { 8 * MiB }; - Optional uncompressed_size {}; -}; - -// Described in section "lzma file format". -struct [[gnu::packed]] LzmaHeader { - u32 dictionary_size() const; - Optional uncompressed_size() const; - - ErrorOr as_decompressor_options() const; - static ErrorOr from_compressor_options(LzmaCompressorOptions const&); - - static ErrorOr decode_model_properties(u8 input_bits); - static ErrorOr encode_model_properties(LzmaModelProperties const&); - - u8 encoded_model_properties; - u32 unchecked_dictionary_size; - u64 encoded_uncompressed_size; - - static constexpr u64 placeholder_for_unknown_uncompressed_size = UINT64_MAX; -}; -static_assert(sizeof(LzmaHeader) == 13); - -class LzmaState { -protected: - // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables. - // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048). - // The default probability for most counters is 0.5. - using Probability = u16; - static constexpr size_t probability_bit_count = 11; - static constexpr Probability default_probability = (1 << probability_bit_count) / 2; - static void initialize_to_default_probability(Span); - - // The significance of the shift width is not explained and appears to be a magic constant. - static constexpr size_t probability_shift_width = 5; - - // "The value of the "Range" variable before each bit decoding can not be smaller than ((UInt32)1 << 24)." - static constexpr u32 minimum_range_value = 1 << 24; - - LzmaState(FixedArray literal_probabilities); - - u64 m_total_processed_bytes { 0 }; - - static constexpr size_t literal_probability_table_size = 0x300; - FixedArray m_literal_probabilities; - - struct LzmaLengthCoderState { - public: - LzmaLengthCoderState(); - - Probability m_first_choice_probability { default_probability }; - Probability m_second_choice_probability { default_probability }; - - static constexpr size_t maximum_number_of_position_bits = 4; - Array, (1 << maximum_number_of_position_bits)> m_low_length_probabilities; - Array, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities; - Array m_high_length_probabilities; - }; - - LzmaLengthCoderState m_length_coder; - LzmaLengthCoderState m_rep_length_coder; - - static constexpr u16 normalized_to_real_match_length_offset = 2; - static constexpr u32 normalized_to_real_match_distance_offset = 1; - - // According to the specification, the largest possible normalized match length is provided by the high coder, - // which processes 8 bits (0 to 255) and adds a displacement of 16 on top. - // This is the minimum size that our input buffer has to have to not miss any possible repetitions while encoding. - static constexpr u16 largest_real_match_length = 255 + 16 + normalized_to_real_match_length_offset; - - static constexpr u32 end_of_stream_marker = 0xFFFFFFFF; - - static constexpr size_t number_of_length_to_position_states = 4; - Array, number_of_length_to_position_states> m_length_to_position_states; - - static constexpr size_t first_position_slot_with_binary_tree_bits = 4; - static constexpr size_t first_position_slot_with_direct_encoded_bits = 14; - - // This is a bit wasteful on memory and not in the specification, but it makes the math easier. - static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits; - static constexpr size_t largest_number_of_binary_tree_distance_bits = 5; - Array, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities; - - static constexpr size_t number_of_alignment_bits = 4; - Array m_alignment_bit_probabilities; - - // LZ state tracking. - u16 m_state { 0 }; - u32 m_rep0 { 0 }; - u32 m_rep1 { 0 }; - u32 m_rep2 { 0 }; - u32 m_rep3 { 0 }; - u32 current_repetition_offset() const; - - void update_state_after_literal(); - void update_state_after_match(); - void update_state_after_rep(); - void update_state_after_short_rep(); - - static constexpr size_t maximum_number_of_position_bits = 4; - static constexpr size_t number_of_states = 12; - Array m_is_match_probabilities; - Array m_is_rep_probabilities; - Array m_is_rep_g0_probabilities; - Array m_is_rep_g1_probabilities; - Array m_is_rep_g2_probabilities; - Array m_is_rep0_long_probabilities; - - enum MatchType { - Literal, - SimpleMatch, - RepMatch0, - ShortRepMatch, - RepMatch1, - RepMatch2, - RepMatch3, - }; -}; - -class LzmaDecompressor : public Stream - , LzmaState { -public: - /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive'). - static ErrorOr> create_from_container(MaybeOwned, Optional> dictionary = {}); - - /// Creates a decompressor from a raw stream of LZMA-compressed data (found inside an LZMA container or embedded in other file formats). - static ErrorOr> create_from_raw_stream(MaybeOwned, LzmaDecompressorOptions const&, Optional> dictionary = {}); - - ErrorOr append_input_stream(MaybeOwned, Optional uncompressed_size); - - virtual ErrorOr read_some(Bytes) override; - virtual ErrorOr write_some(ReadonlyBytes) override; - virtual bool is_eof() const override; - virtual bool is_open() const override; - virtual void close() override; - -private: - LzmaDecompressor(MaybeOwned, LzmaDecompressorOptions, MaybeOwned, FixedArray literal_probabilities); - - MaybeOwned m_stream; - LzmaDecompressorOptions m_options; - - // This doubles as an output buffer, since we have to write all of our results into this anyways. - MaybeOwned m_dictionary; - bool m_found_end_of_stream_marker { false }; - bool is_range_decoder_in_clean_state() const; - bool has_reached_expected_data_size() const; - Optional m_leftover_match_length; - - // Range decoder state (initialized with stream data in LzmaDecompressor::create). - u32 m_range_decoder_range { 0xFFFFFFFF }; - u32 m_range_decoder_code { 0 }; - - ErrorOr initialize_range_decoder(); - ErrorOr normalize_range_decoder(); - ErrorOr decode_direct_bit(); - ErrorOr decode_bit_with_probability(Probability& probability); - - ErrorOr decode_match_type(); - - // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order). - // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size. - ErrorOr decode_symbol_using_bit_tree(size_t bit_count, Span probability_tree); - ErrorOr decode_symbol_using_reverse_bit_tree(size_t bit_count, Span probability_tree); - - ErrorOr decode_literal_to_output_buffer(); - - ErrorOr decode_normalized_match_length(LzmaLengthCoderState&); - - // This deviates from the specification, which states that "unsigned" is at least 16-bit. - // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value. - ErrorOr decode_normalized_match_distance(u16 normalized_match_length); -}; - -class LzmaCompressor : public Stream - , LzmaState { -public: - /// Creates a compressor for a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive'). - static ErrorOr> create_container(MaybeOwned, LzmaCompressorOptions const&); - - /// Finishes the archive by writing out the remaining data from the range coder. - ErrorOr flush(); - - virtual ErrorOr read_some(Bytes) override; - virtual ErrorOr write_some(ReadonlyBytes) override; - virtual bool is_eof() const override; - virtual bool is_open() const override; - virtual void close() override; - - virtual ~LzmaCompressor(); - -private: - LzmaCompressor(MaybeOwned, LzmaCompressorOptions, MaybeOwned, FixedArray literal_probabilities); - - ErrorOr shift_range_encoder(); - ErrorOr normalize_range_encoder(); - ErrorOr encode_direct_bit(u8 value); - ErrorOr encode_bit_with_probability(Probability&, u8 value); - ErrorOr encode_symbol_using_bit_tree(size_t bit_count, Span probability_tree, u16 value); - ErrorOr encode_symbol_using_reverse_bit_tree(size_t bit_count, Span probability_tree, u16 value); - ErrorOr encode_normalized_match_length(LzmaLengthCoderState&, u16 normalized_length); - ErrorOr encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance); - - ErrorOr encode_match_type(MatchType); - ErrorOr encode_literal(u8 literal); - ErrorOr encode_existing_match(size_t real_distance, size_t real_length); - ErrorOr encode_new_match(size_t real_distance, size_t real_length); - ErrorOr encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length); - - ErrorOr encode_once(); - - bool m_has_flushed_data { false }; - - MaybeOwned m_stream; - LzmaCompressorOptions m_options; - - // This doubles as an input buffer, which is appended at the very front of the buffer. - // Therefore, the size of this should at least be the dictionary size + the largest possible repetition length. - MaybeOwned m_dictionary; - - // Range encoder state. - u32 m_range_encoder_range { 0xFFFFFFFF }; - u64 m_range_encoder_code { 0 }; - - // Since the range is only 32-bits, we can overflow at most +1 into the next byte beyond the usual 32-bit code. - // Therefore, it is sufficient to store the highest byte (which may still change due to that +1 overflow) and - // the length of the chain of 0xFF bytes that may end up propagating that change. - u8 m_range_encoder_cached_byte { 0x00 }; - size_t m_range_encoder_ff_chain_length { 0 }; -}; - -} - -template<> -struct AK::Traits : public AK::DefaultTraits { - static constexpr bool is_trivially_serializable() { return true; } -}; diff --git a/Meta/CMake/all_the_debug_macros.cmake b/Meta/CMake/all_the_debug_macros.cmake index da83274de72..79368e293c9 100644 --- a/Meta/CMake/all_the_debug_macros.cmake +++ b/Meta/CMake/all_the_debug_macros.cmake @@ -33,7 +33,6 @@ set(LIBWEB_CSS_ANIMATION_DEBUG ON) set(LIBWEB_CSS_DEBUG ON) set(LIBWEB_WASM_DEBUG ON) set(LINE_EDITOR_DEBUG ON) -set(LZMA_DEBUG ON) set(LZW_DEBUG ON) set(MACH_PORT_DEBUG ON) set(MATROSKA_DEBUG ON) diff --git a/Meta/Lagom/Fuzzers/FuzzLzmaDecompression.cpp b/Meta/Lagom/Fuzzers/FuzzLzmaDecompression.cpp deleted file mode 100644 index 79ae10c3db6..00000000000 --- a/Meta/Lagom/Fuzzers/FuzzLzmaDecompression.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2023, Tim Schumacher . - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include - -extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size) -{ - AK::set_debug_enabled(false); - - // LibFuzzer has a default memory limit of 2048 MB, so limit the dictionary size to a - // reasonable number to make sure that we don't actually run into it by allocating a - // huge dictionary. The chosen value is double of what the largest dictionary in the - // specifications test files is, so it should be more than enough for fuzzing everything - // that we would want to fuzz. - constexpr size_t largest_reasonable_dictionary_size = 16 * MiB; - - if (size >= sizeof(Compress::LzmaHeader)) { - auto const* header = reinterpret_cast(data); - if (header->dictionary_size() > largest_reasonable_dictionary_size) - return -1; - } - - auto stream = make(ReadonlyBytes { data, size }); - auto decompressor_or_error = Compress::LzmaDecompressor::create_from_container(move(stream)); - if (decompressor_or_error.is_error()) - return 0; - auto decompressor = decompressor_or_error.release_value(); - while (!decompressor->is_eof()) { - auto maybe_error = decompressor->discard(4096); - if (maybe_error.is_error()) - break; - } - return 0; -} diff --git a/Meta/Lagom/Fuzzers/FuzzLzmaRoundtrip.cpp b/Meta/Lagom/Fuzzers/FuzzLzmaRoundtrip.cpp deleted file mode 100644 index 9b859eb1b24..00000000000 --- a/Meta/Lagom/Fuzzers/FuzzLzmaRoundtrip.cpp +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2023, Tim Schumacher . - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include -#include - -extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size) -{ - AK::set_debug_enabled(false); - - AllocatingMemoryStream stream {}; - - auto compressor = MUST(Compress::LzmaCompressor::create_container(MaybeOwned { stream }, {})); - MUST(compressor->write_until_depleted({ data, size })); - MUST(compressor->flush()); - - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(MaybeOwned { stream })); - auto result = MUST(decompressor->read_until_eof()); - - VERIFY((ReadonlyBytes { data, size }) == result.span()); - - return 0; -} diff --git a/Meta/Lagom/Fuzzers/fuzzers.cmake b/Meta/Lagom/Fuzzers/fuzzers.cmake index 67744741823..118a992b593 100644 --- a/Meta/Lagom/Fuzzers/fuzzers.cmake +++ b/Meta/Lagom/Fuzzers/fuzzers.cmake @@ -12,8 +12,6 @@ set(FUZZER_TARGETS JPEGLoader Js JsonParser - LzmaDecompression - LzmaRoundtrip MatroskaReader MD5 PEM @@ -56,8 +54,6 @@ set(FUZZER_DEPENDENCIES_GzipRoundtrip LibCompress) set(FUZZER_DEPENDENCIES_ICOLoader LibGfx) set(FUZZER_DEPENDENCIES_JPEGLoader LibGfx) set(FUZZER_DEPENDENCIES_Js LibJS LibGC) -set(FUZZER_DEPENDENCIES_LzmaDecompression LibCompress) -set(FUZZER_DEPENDENCIES_LzmaRoundtrip LibCompress) set(FUZZER_DEPENDENCIES_MatroskaReader LibMedia) set(FUZZER_DEPENDENCIES_MD5 LibCrypto) set(FUZZER_DEPENDENCIES_PEM LibCrypto) diff --git a/Meta/gn/secondary/AK/BUILD.gn b/Meta/gn/secondary/AK/BUILD.gn index b39976a1cf7..c9daf40a10c 100644 --- a/Meta/gn/secondary/AK/BUILD.gn +++ b/Meta/gn/secondary/AK/BUILD.gn @@ -254,7 +254,6 @@ write_cmake_config("ak_debug_gen") { "LIBWEB_CSS_DEBUG=", "LIBWEB_WASM_DEBUG=", "LINE_EDITOR_DEBUG=", - "LZMA_DEBUG=", "LZW_DEBUG=", "MACH_PORT_DEBUG=", "MATROSKA_DEBUG=", diff --git a/Meta/gn/secondary/Userland/Libraries/LibCompress/BUILD.gn b/Meta/gn/secondary/Userland/Libraries/LibCompress/BUILD.gn index ce48404f72f..3a1a94a3eb1 100644 --- a/Meta/gn/secondary/Userland/Libraries/LibCompress/BUILD.gn +++ b/Meta/gn/secondary/Userland/Libraries/LibCompress/BUILD.gn @@ -4,7 +4,6 @@ shared_library("LibCompress") { sources = [ "Deflate.cpp", "Gzip.cpp", - "Lzma.cpp", "PackBitsDecoder.cpp", "Zlib.cpp", ] diff --git a/Tests/LibCompress/CMakeLists.txt b/Tests/LibCompress/CMakeLists.txt index 1dd4c8bd672..aec1cadddef 100644 --- a/Tests/LibCompress/CMakeLists.txt +++ b/Tests/LibCompress/CMakeLists.txt @@ -1,7 +1,6 @@ set(TEST_SOURCES TestDeflate.cpp TestGzip.cpp - TestLzma.cpp TestLzw.cpp TestPackBits.cpp TestZlib.cpp diff --git a/Tests/LibCompress/TestLzma.cpp b/Tests/LibCompress/TestLzma.cpp deleted file mode 100644 index 67f2a0d9f2e..00000000000 --- a/Tests/LibCompress/TestLzma.cpp +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2023, Tim Schumacher - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include - -#include -#include - -TEST_CASE(repetition_length_beyond_distance) -{ - // This test exists to ensure correctness when repeating data from the dictionary that has been - // written earlier during the same repetition. - // While this test case is not large enough to testify how well this is optimized, it may still - // be a constellation that is improperly implemented as a whole. - - Array const compressed { - 0x5D, // Model properties (lc = 3, lp = 0, pb = 2) - 0x00, 0x10, 0x00, 0x00, // Dictionary size (4 KB) - 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Uncompressed size (7) - - // Encode a literal 'A' and a literal 'B', followed by a repetition from (real) distance 2 with a (real) length of 5. - 0x00, 0x20, 0x90, 0x9F, 0x04, 0x00, 0x00, 0x00 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer = TRY_OR_FAIL(decompressor->read_until_eof(PAGE_SIZE)); - EXPECT_EQ("ABABABA"sv.bytes(), buffer.span()); -} - -TEST_CASE(compress_decompress_roundtrip_with_known_size) -{ - auto const uncompressed = "Well hello friends, this is a simple text file :)"sv.bytes(); - - auto stream = MUST(try_make()); - - Compress::LzmaCompressorOptions const compressor_options { - .literal_context_bits = 3, - .literal_position_bits = 0, - .position_bits = 2, - .dictionary_size = 4 * KiB, - .uncompressed_size = uncompressed.size(), - }; - auto compressor = TRY_OR_FAIL(Compress::LzmaCompressor::create_container(MaybeOwned { *stream }, compressor_options)); - TRY_OR_FAIL(compressor->write_until_depleted(uncompressed)); - - auto decompressor = TRY_OR_FAIL(Compress::LzmaDecompressor::create_from_container(MaybeOwned { *stream })); - auto result = TRY_OR_FAIL(decompressor->read_until_eof()); - - EXPECT_EQ(uncompressed, result.span()); -} - -TEST_CASE(compress_decompress_roundtrip_with_unknown_size) -{ - auto const uncompressed = "Well hello friends, this is a simple text file :)"sv.bytes(); - - auto stream = MUST(try_make()); - - Compress::LzmaCompressorOptions const compressor_options { - .literal_context_bits = 3, - .literal_position_bits = 0, - .position_bits = 2, - .dictionary_size = 4 * KiB, - }; - auto compressor = TRY_OR_FAIL(Compress::LzmaCompressor::create_container(MaybeOwned { *stream }, compressor_options)); - TRY_OR_FAIL(compressor->write_until_depleted(uncompressed)); - TRY_OR_FAIL(compressor->flush()); - - auto decompressor = TRY_OR_FAIL(Compress::LzmaDecompressor::create_from_container(MaybeOwned { *stream })); - auto result = TRY_OR_FAIL(decompressor->read_until_eof()); - - EXPECT_EQ(uncompressed, result.span()); -} - -TEST_CASE(compress_long_overflow_chain) -{ - // Encoding 0xFF followed by the end-of-stream marker results in a chain of bytes that doesn't fit into 64 bits, - // which breaks naive implementations of "hold back the byte until it no longer changes". - - Array const uncompressed { - 0xFF - }; - - auto stream = MUST(try_make()); - auto compressor = TRY_OR_FAIL(Compress::LzmaCompressor::create_container(MaybeOwned { *stream }, {})); - TRY_OR_FAIL(compressor->write_until_depleted(uncompressed)); - TRY_OR_FAIL(compressor->flush()); - - auto decompressor = TRY_OR_FAIL(Compress::LzmaDecompressor::create_from_container(MaybeOwned { *stream })); - auto result = TRY_OR_FAIL(decompressor->read_until_eof()); - - EXPECT_EQ(uncompressed, result.span()); -} - -// The following tests are based on test files from the LZMA specification, which has been placed in the public domain. -// LZMA Specification Draft (2015): https://www.7-zip.org/a/lzma-specification.7z - -Array const specification_a_txt { - 0x4C, 0x5A, 0x4D, 0x41, 0x20, 0x64, 0x65, 0x63, 0x6F, 0x64, 0x65, 0x72, 0x20, 0x74, 0x65, 0x73, - 0x74, 0x20, 0x65, 0x78, 0x61, 0x6D, 0x70, 0x6C, 0x65, 0x0D, 0x0A, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x0D, 0x0A, 0x21, 0x20, 0x4C, 0x5A, 0x4D, 0x41, 0x20, 0x21, 0x20, 0x44, - 0x65, 0x63, 0x6F, 0x64, 0x65, 0x72, 0x20, 0x21, 0x20, 0x54, 0x45, 0x53, 0x54, 0x20, 0x21, 0x0D, - 0x0A, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x0D, 0x0A, 0x21, 0x20, 0x54, 0x45, - 0x53, 0x54, 0x20, 0x21, 0x20, 0x4C, 0x5A, 0x4D, 0x41, 0x20, 0x21, 0x20, 0x44, 0x65, 0x63, 0x6F, - 0x64, 0x65, 0x72, 0x20, 0x21, 0x0D, 0x0A, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x0D, 0x0A, 0x2D, 0x2D, 0x2D, 0x2D, 0x20, 0x54, 0x65, 0x73, 0x74, 0x20, 0x4C, 0x69, 0x6E, 0x65, - 0x20, 0x31, 0x20, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x20, 0x0D, 0x0A, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x0D, 0x0A, 0x2D, 0x2D, 0x2D, 0x2D, 0x20, 0x54, 0x65, - 0x73, 0x74, 0x20, 0x4C, 0x69, 0x6E, 0x65, 0x20, 0x32, 0x20, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, 0x2D, - 0x2D, 0x2D, 0x20, 0x0D, 0x0A, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x0D, 0x0A, - 0x3D, 0x3D, 0x3D, 0x20, 0x45, 0x6E, 0x64, 0x20, 0x6F, 0x66, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, - 0x66, 0x69, 0x6C, 0x65, 0x20, 0x3D, 0x3D, 0x3D, 0x3D, 0x20, 0x0D, 0x0A, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, - 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x0D, 0x0A -}; - -TEST_CASE(specification_a_lzma_decompress) -{ - Array const compressed { - 0x5D, 0x00, 0x00, 0x80, 0x00, 0x47, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0x16, - 0x85, 0xBC, 0x45, 0xF0, 0xDF, 0xFF, 0xD2, 0xE8, 0x41, 0xF5, 0xCE, 0xE5, 0x90, 0xE1, 0xC8, 0x20, - 0xEA, 0xC6, 0x37, 0xBE, 0x2B, 0xD1, 0xF4, 0xC3, 0x34, 0x6F, 0x2F, 0x83, 0xC2, 0xA6, 0x7C, 0x6F, - 0x3D, 0x88, 0xA0, 0x58, 0x22, 0x1F, 0x3A, 0xBA, 0x7B, 0xC6, 0xDD, 0x66, 0xFE, 0xF8, 0x92, 0xE4, - 0xCB, 0x1C, 0xC4, 0x19, 0x0A, 0x0C, 0x8B, 0x2E, 0x39, 0xB8, 0xB8, 0x03, 0xCD, 0x5A, 0x9E, 0x10, - 0x3A, 0x4F, 0x65, 0xFA, 0x41, 0xCB, 0xF2, 0x79, 0x65, 0xD7, 0xF1, 0x9F, 0xAB, 0x70, 0x1D, 0x6F, - 0xF7, 0xB6, 0x79, 0xCC, 0x8A, 0x7D, 0xCE, 0xDB, 0xF8, 0xF6, 0x9E, 0xC9, 0x12, 0x9F, 0xAA, 0xBF, - 0x89, 0xFE, 0x05, 0x36, 0x80 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer = TRY_OR_FAIL(decompressor->read_until_eof(PAGE_SIZE)); - EXPECT_EQ(specification_a_txt, buffer.span()); -} - -TEST_CASE(specification_a_eos_lzma_decompress) -{ - Array const compressed { - 0x5D, 0x00, 0x00, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x26, 0x16, - 0x85, 0xBC, 0x45, 0xF0, 0xDF, 0xFF, 0xD2, 0xE8, 0x41, 0xF5, 0xCE, 0xE5, 0x90, 0xE1, 0xC8, 0x20, - 0xEA, 0xC6, 0x37, 0xBE, 0x2B, 0xD1, 0xF4, 0xC3, 0x34, 0x6F, 0x2F, 0x83, 0xC2, 0xA6, 0x7C, 0x6F, - 0x3D, 0x88, 0xA0, 0x58, 0x22, 0x1F, 0x3A, 0xBA, 0x7B, 0xC6, 0xDD, 0x66, 0xFE, 0xF8, 0x92, 0xE4, - 0xCB, 0x1C, 0xC4, 0x19, 0x0A, 0x0C, 0x8B, 0x2E, 0x39, 0xB8, 0xB8, 0x03, 0xCD, 0x5A, 0x9E, 0x10, - 0x3A, 0x4F, 0x65, 0xFA, 0x41, 0xCB, 0xF2, 0x79, 0x65, 0xD7, 0xF1, 0x9F, 0xAB, 0x70, 0x1D, 0x6F, - 0xF7, 0xB6, 0x79, 0xCC, 0x8A, 0x7D, 0xCE, 0xDB, 0xF8, 0xF6, 0x9E, 0xC9, 0x12, 0x9F, 0xAA, 0xBF, - 0x8A, 0x08, 0xF5, 0x99, 0x8D, 0x7F, 0xFA, 0x18, 0x0A, 0x52 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer = TRY_OR_FAIL(decompressor->read_until_eof(PAGE_SIZE)); - EXPECT_EQ(specification_a_txt, buffer.span()); -} - -TEST_CASE(specification_a_eos_and_size_lzma_decompress) -{ - Array const compressed { - 0x5D, 0x00, 0x00, 0x01, 0x00, 0x47, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0x16, - 0x85, 0xBC, 0x45, 0xF0, 0xDF, 0xFF, 0xD2, 0xE8, 0x41, 0xF5, 0xCE, 0xE5, 0x90, 0xE1, 0xC8, 0x20, - 0xEA, 0xC6, 0x37, 0xBE, 0x2B, 0xD1, 0xF4, 0xC3, 0x34, 0x6F, 0x2F, 0x83, 0xC2, 0xA6, 0x7C, 0x6F, - 0x3D, 0x88, 0xA0, 0x58, 0x22, 0x1F, 0x3A, 0xBA, 0x7B, 0xC6, 0xDD, 0x66, 0xFE, 0xF8, 0x92, 0xE4, - 0xCB, 0x1C, 0xC4, 0x19, 0x0A, 0x0C, 0x8B, 0x2E, 0x39, 0xB8, 0xB8, 0x03, 0xCD, 0x5A, 0x9E, 0x10, - 0x3A, 0x4F, 0x65, 0xFA, 0x41, 0xCB, 0xF2, 0x79, 0x65, 0xD7, 0xF1, 0x9F, 0xAB, 0x70, 0x1D, 0x6F, - 0xF7, 0xB6, 0x79, 0xCC, 0x8A, 0x7D, 0xCE, 0xDB, 0xF8, 0xF6, 0x9E, 0xC9, 0x12, 0x9F, 0xAA, 0xBF, - 0x8A, 0x08, 0xF5, 0x99, 0x8D, 0x7F, 0xFA, 0x18, 0x0A, 0x52 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer = TRY_OR_FAIL(decompressor->read_until_eof(PAGE_SIZE)); - EXPECT_EQ(specification_a_txt, buffer.span()); -} - -TEST_CASE(specification_a_lp1_lc2_pb1_lzma_decompress) -{ - // Note: The name of this test file (and the accompanying info.txt) is wrong. It is encoded with lc = 1 instead of lc = 2. - Array const compressed { - 0x37, 0x00, 0x00, 0x01, 0x00, 0x47, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0x16, - 0x86, 0x23, 0xBC, 0x5C, 0xC9, 0x40, 0x2B, 0x6B, 0x91, 0x5B, 0xCD, 0x90, 0x40, 0xCB, 0x9A, 0x71, - 0x5B, 0x84, 0x68, 0xE0, 0x5A, 0xAB, 0xA3, 0xE9, 0x04, 0xF7, 0xA3, 0xA6, 0x8E, 0x5F, 0xAA, 0x24, - 0x8B, 0xFC, 0x20, 0x38, 0xA6, 0xB7, 0x2A, 0x47, 0xAF, 0x07, 0xF7, 0x14, 0xAC, 0xE8, 0xB4, 0xD9, - 0x96, 0x27, 0xE0, 0xF4, 0x47, 0x8D, 0xE9, 0xDD, 0x05, 0x28, 0x1A, 0xDF, 0xB1, 0xED, 0x1A, 0xDC, - 0x0B, 0x55, 0xB2, 0xBD, 0x55, 0x69, 0x6C, 0xD9, 0xFC, 0x70, 0x43, 0xA7, 0x16, 0x58, 0x99, 0xFE, - 0x97, 0x04, 0x11, 0x27, 0x56, 0x5E, 0xC6, 0xB0, 0x4E, 0x31, 0xA0, 0xCB, 0x17, 0x27, 0xEC, 0x72, - 0x36, 0x0E, 0x9A, 0xAD, 0x00 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer = TRY_OR_FAIL(decompressor->read_until_eof(PAGE_SIZE)); - EXPECT_EQ(specification_a_txt, buffer.span()); -} - -TEST_CASE(specification_bad_corrupted_lzma_decompress) -{ - Array const compressed { - 0x5D, 0x00, 0x00, 0x80, 0x00, 0x47, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0x16, - 0x85, 0xBC, 0x45, 0xF0, 0xDF, 0xFF, 0xD2, 0xE8, 0x41, 0xF5, 0xCE, 0xE5, 0x90, 0xE1, 0xC8, 0x20, - 0xEA, 0xC6, 0x37, 0xBE, 0x2B, 0xD1, 0xF4, 0xC3, 0x34, 0x6F, 0x2F, 0x83, 0xC2, 0xA6, 0x7C, 0x6F, - 0x3D, 0x88, 0xA0, 0x58, 0x22, 0x1F, 0x3A, 0xBA, 0x7B, 0xC6, 0xDD, 0x66, 0xFE, 0xF8, 0x92, 0xE4, - 0xCB, 0x1C, 0xC4, 0x19, 0x0A, 0x0C, 0x8B, 0x2E, 0x39, 0xB8, 0xB8, 0x03, 0xCD, 0x5A, 0x9E, 0x10, - 0x3A, 0x4F, 0x65, 0xFA, 0x41, 0xCB, 0xF2, 0x79, 0x65, 0xD7, 0xF1, 0xFF, 0xFF, 0xFF, 0x1D, 0x6F, - 0xF7, 0xB6, 0x79, 0xCC, 0x8A, 0x7D, 0xCE, 0xDB, 0xF8, 0xF6, 0x9E, 0xC9, 0x12, 0x9F, 0xAA, 0xBF, - 0x89, 0xFE, 0x05, 0x36, 0x80 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer_or_error = decompressor->read_until_eof(PAGE_SIZE); - EXPECT(buffer_or_error.is_error()); -} - -TEST_CASE(specification_bad_eos_incorrect_size_lzma_decompress) -{ - Array const compressed { - 0x5D, 0x00, 0x00, 0x01, 0x00, 0x48, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0x16, - 0x85, 0xBC, 0x45, 0xF0, 0xDF, 0xFF, 0xD2, 0xE8, 0x41, 0xF5, 0xCE, 0xE5, 0x90, 0xE1, 0xC8, 0x20, - 0xEA, 0xC6, 0x37, 0xBE, 0x2B, 0xD1, 0xF4, 0xC3, 0x34, 0x6F, 0x2F, 0x83, 0xC2, 0xA6, 0x7C, 0x6F, - 0x3D, 0x88, 0xA0, 0x58, 0x22, 0x1F, 0x3A, 0xBA, 0x7B, 0xC6, 0xDD, 0x66, 0xFE, 0xF8, 0x92, 0xE4, - 0xCB, 0x1C, 0xC4, 0x19, 0x0A, 0x0C, 0x8B, 0x2E, 0x39, 0xB8, 0xB8, 0x03, 0xCD, 0x5A, 0x9E, 0x10, - 0x3A, 0x4F, 0x65, 0xFA, 0x41, 0xCB, 0xF2, 0x79, 0x65, 0xD7, 0xF1, 0x9F, 0xAB, 0x70, 0x1D, 0x6F, - 0xF7, 0xB6, 0x79, 0xCC, 0x8A, 0x7D, 0xCE, 0xDB, 0xF8, 0xF6, 0x9E, 0xC9, 0x12, 0x9F, 0xAA, 0xBF, - 0x8A, 0x08, 0xF5, 0x99, 0x8D, 0x7F, 0xFA, 0x18, 0x0A, 0x52 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer_or_error = decompressor->read_until_eof(PAGE_SIZE); - EXPECT(buffer_or_error.is_error()); -} - -TEST_CASE(specification_bad_incorrect_size_lzma_decompress) -{ - Array const compressed { - 0x5D, 0x00, 0x00, 0x80, 0x00, 0x22, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0x16, - 0x85, 0xBC, 0x45, 0xF0, 0xDF, 0xFF, 0xD2, 0xE8, 0x41, 0xF5, 0xCE, 0xE5, 0x90, 0xE1, 0xC8, 0x20, - 0xEA, 0xC6, 0x37, 0xBE, 0x2B, 0xD1, 0xF4, 0xC3, 0x34, 0x6F, 0x2F, 0x83, 0xC2, 0xA6, 0x7C, 0x6F, - 0x3D, 0x88, 0xA0, 0x58, 0x22, 0x1F, 0x3A, 0xBA, 0x7B, 0xC6, 0xDD, 0x66, 0xFE, 0xF8, 0x92, 0xE4, - 0xCB, 0x1C, 0xC4, 0x19, 0x0A, 0x0C, 0x8B, 0x2E, 0x39, 0xB8, 0xB8, 0x03, 0xCD, 0x5A, 0x9E, 0x10, - 0x3A, 0x4F, 0x65, 0xFA, 0x41, 0xCB, 0xF2, 0x79, 0x65, 0xD7, 0xF1, 0x9F, 0xAB, 0x70, 0x1D, 0x6F, - 0xF7, 0xB6, 0x79, 0xCC, 0x8A, 0x7D, 0xCE, 0xDB, 0xF8, 0xF6, 0x9E, 0xC9, 0x12, 0x9F, 0xAA, 0xBF, - 0x89, 0xFE, 0x05, 0x36, 0x80 - }; - - auto stream = MUST(try_make(compressed)); - auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream))); - auto buffer_or_error = decompressor->read_until_eof(PAGE_SIZE); - EXPECT(buffer_or_error.is_error()); -}