LibPDF: Add a basic parser and Document structure

This commit adds a parser as well as the Reader class, which serves
as a utility to aid in reading the PDF both forwards and in reverse.
The parser currently is capable of reading xref tables, as well as
all values. We don't really do anything with any of this information,
however.
This commit is contained in:
Matthew Olsson 2021-04-30 18:33:13 -07:00 committed by Andreas Kling
parent a8f5b6aaa3
commit 72f693e9ed
Notes: sideshowbarker 2024-07-18 18:23:52 +09:00
7 changed files with 1008 additions and 0 deletions

View file

@ -1,5 +1,7 @@
# Translation units that make up LibPDF.
set(SOURCES
Object.cpp
Document.cpp
Parser.cpp
Value.cpp
)

View file

@ -0,0 +1,22 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibPDF/Document.h>
#include <LibPDF/Parser.h>
namespace PDF {
// Builds a Document from the raw bytes of a PDF file: asserts that the parser
// accepts the file, then stores the last cross-reference table and trailer.
Document::Document(const ReadonlyBytes& bytes)
    : m_parser(Parser({}, bytes))
{
    auto is_valid = m_parser.perform_validation();
    VERIFY(is_valid);

    auto parsed = m_parser.parse_last_xref_table_and_trailer();
    m_xref_table = parsed.xref_table;
    m_trailer = parsed.trailer;
}
}

View file

@ -0,0 +1,42 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/RefCounted.h>
#include <LibPDF/Object.h>
#include <LibPDF/Parser.h>
#include <LibPDF/XRefTable.h>
namespace PDF {
// A parsed PDF file. Owns the Parser that reads the underlying bytes, the
// cross-reference table and trailer dictionary found at the end of the file,
// and a map of values keyed by an unsigned index.
class Document final : public RefCounted<Document> {
public:
    explicit Document(const ReadonlyBytes& bytes);

    ALWAYS_INLINE const XRefTable& xref_table() const { return m_xref_table; }

    ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; }

    // Returns the value stored under `index`, or a default-constructed Value
    // when nothing has been stored for that index.
    ALWAYS_INLINE Value get_value(u32 index) const
    {
        return m_values.get(index).value_or({});
    }

    // Stores (or overwrites) the value associated with `index`.
    ALWAYS_INLINE void set_value(u32 index, const Value& value)
    {
        // m_values is a hash map keyed by index, not a dense array, so there
        // is no reason to reserve `index` slots before inserting one entry.
        m_values.set(index, value);
    }

private:
    Parser m_parser;
    XRefTable m_xref_table;
    RefPtr<DictObject> m_trailer;
    HashMap<u32, Value> m_values;
};
}

View file

@ -0,0 +1,620 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/ScopeGuard.h>
#include <AK/TypeCasts.h>
#include <LibPDF/Document.h>
#include <LibPDF/Parser.h>
#include <ctype.h>
#include <math.h>
namespace PDF {
// Heap-allocates a T (constrained to types derived from Object) and returns
// it wrapped in a NonnullRefPtr via adopt_ref().
// Arguments are taken as forwarding references so that forward<Args>()
// actually preserves the caller's value category instead of operating on
// by-value copies.
template<typename T, typename... Args>
static NonnullRefPtr<T> make_object(Args&&... args) requires(IsBaseOf<Object, T>)
{
    return adopt_ref(*new T(forward<Args>(args)...));
}
Parser::Parser(Badge<Document>, const ReadonlyBytes& bytes)
: m_reader(bytes)
{
}
// Returns true only when the file does not look linearized and its header
// parses successfully. The header is not examined for files that look
// linearized.
bool Parser::perform_validation()
{
    if (sloppy_is_linearized())
        return false;
    return parse_header();
}
// Locates the final "startxref" entry by scanning backwards from the end of
// the buffer, then parses the cross-reference table it points at together
// with the trailer that follows that table.
Parser::XRefTableAndTrailer Parser::parse_last_xref_table_and_trailer()
{
    // Begin on the last byte and walk backwards to the "%%EOF" marker.
    auto last_byte_offset = m_reader.bytes().size() - 1;
    m_reader.move_to(last_byte_offset);
    auto found_eof_marker = navigate_to_before_eof_marker();
    VERIFY(found_eof_marker);

    navigate_to_after_startxref();
    VERIFY(!m_reader.done());

    // The number after "startxref" is the byte offset of the xref table.
    m_reader.set_reading_forwards();
    auto xref_offset_value = parse_number();
    VERIFY(xref_offset_value.is_int());
    m_reader.move_to(xref_offset_value.as_int());

    auto table = parse_xref_table();
    auto trailer_dict = parse_file_trailer();
    return { table, trailer_dict };
}
// Parses the "%PDF-M.m" header at the start of the buffer.
// Returns false when the header is missing or names a version outside
// 1.0-1.7 / 2.0-2.7; returns true otherwise. On return the reader is
// positioned after the header line and the optional comment that follows it.
bool Parser::parse_header()
{
    // FIXME: Do something with the version?
    m_reader.set_reading_forwards();
    m_reader.move_to(0);
    if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
        return false;
    m_reader.move_by(5);

    char major_ver = m_reader.read();
    if (major_ver != '1' && major_ver != '2')
        return false;
    if (m_reader.read() != '.')
        return false;

    // The upper bound must be checked on the minor version. Comparing the
    // major version against '7' is always false once it is known to be '1'
    // or '2', which let any minor digit (or non-digit above '0') through.
    char minor_ver = m_reader.read();
    if (minor_ver < '0' || minor_ver > '7')
        return false;
    consume_eol();

    // Parse optional high-byte comment, which signifies a binary file
    // FIXME: Do something with this?
    auto comment = parse_comment();
    if (!comment.is_empty()) {
        auto binary = comment.length() >= 4;
        if (binary) {
            // A binary marker consists of bytes whose value is 128 or greater.
            for (size_t i = 0; i < comment.length() && binary; i++)
                binary = static_cast<u8>(comment[i]) >= 128;
        }
    }
    return true;
}
// Parses a classic cross-reference table: the "xref" keyword followed by one
// or more subsections, each introduced by "<first index> <count>" and
// containing <count> fixed-width entries. Parsing stops, without consuming
// it, at the "trailer" keyword.
XRefTable Parser::parse_xref_table()
{
    VERIFY(m_reader.matches("xref"));
    m_reader.move_by(4);
    consume_eol();

    XRefTable table;
    while (true) {
        if (m_reader.matches("trailer"))
            break;

        // Anything other than "trailer" must be a subsection header. Without
        // this check parse_number() yields 0 without consuming input and the
        // loop would never terminate on malformed or truncated data.
        VERIFY(matches_number());

        Vector<XRefEntry> entries;
        auto starting_index_value = parse_number();
        auto starting_index = starting_index_value.as_int();
        auto object_count_value = parse_number();
        auto object_count = object_count_value.as_int();
        VERIFY(starting_index >= 0);
        VERIFY(object_count >= 0);

        for (int i = 0; i < object_count; i++) {
            // Each entry is "oooooooooo ggggg n|f" followed by a two-byte line ending.
            auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
            m_reader.move_by(10);
            consume(' ');
            auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
            m_reader.move_by(5);
            consume(' ');
            auto letter = m_reader.read();
            VERIFY(letter == 'n' || letter == 'f');

            // The line ending sequence can be one of the following:
            // SP CR, SP LF, or CR LF
            if (m_reader.matches(' ')) {
                consume();
                auto ch = consume();
                VERIFY(ch == '\r' || ch == '\n');
            } else {
                VERIFY(m_reader.matches("\r\n"));
                m_reader.move_by(2);
            }

            auto offset = strtol(offset_string.characters(), nullptr, 10);
            auto generation = strtol(generation_string.characters(), nullptr, 10);
            entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
        }
        table.add_section({ starting_index, object_count, entries });
    }
    return table;
}
// Parses the file trailer that follows a cross-reference table:
// the "trailer" keyword, the trailer dictionary, the "startxref" keyword and
// its line, and finally the "%%EOF" marker. Asserts (VERIFY) on every
// structural expectation and that the reader is done after the marker and
// any trailing whitespace. Returns the trailer dictionary.
NonnullRefPtr<DictObject> Parser::parse_file_trailer()
{
VERIFY(m_reader.matches("trailer"));
m_reader.move_by(7);
consume_whitespace();
auto dict = parse_dict();
VERIFY(m_reader.matches("startxref"));
m_reader.move_by(9);
consume_whitespace();
// Skip the remainder of the line after "startxref" without interpreting it.
m_reader.move_until([&](auto) { return matches_eol(); });
consume_eol();
VERIFY(m_reader.matches("%%EOF"));
m_reader.move_by(5);
consume_whitespace();
// Only whitespace is tolerated after the end-of-file marker.
VERIFY(m_reader.done());
return dict;
}
// Switches the reader to backwards mode and scans towards the start of the
// buffer for a "%%EOF" marker that has an end-of-line on both sides.
// Returns true after moving past the marker and the end-of-line in front of
// it; returns false if the reader runs out of data first.
bool Parser::navigate_to_before_eof_marker()
{
m_reader.set_reading_backwards();
while (!m_reader.done()) {
// Walk back to the nearest end-of-line character.
m_reader.move_until([&](auto) { return matches_eol(); });
if (m_reader.done())
return false;
consume_eol();
// Reader::matches() compares against the reversed string while reading backwards.
if (!m_reader.matches("%%EOF"))
continue;
m_reader.move_by(5);
// The marker must itself be preceded by an end-of-line.
if (!matches_eol())
continue;
consume_eol();
return true;
}
return false;
}
// Switches the reader to backwards mode and scans towards the start of the
// buffer for a line consisting of "startxref". On success the reader is moved
// to the first byte after the end-of-line character at which the scan stopped
// (i.e. the start of the line following "startxref") and true is returned.
// Returns false if the start of the buffer is reached first.
bool Parser::navigate_to_after_startxref()
{
    m_reader.set_reading_backwards();
    while (!m_reader.done()) {
        m_reader.move_until([&](auto) { return matches_eol(); });
        // Running off the front of the buffer means there is no "startxref".
        // Bail out here (as navigate_to_before_eof_marker() does) rather than
        // letting consume_eol() read before the first byte.
        if (m_reader.done())
            return false;

        // Remember where the line after this end-of-line character begins.
        auto offset = m_reader.offset() + 1;
        consume_eol();
        if (!m_reader.matches("startxref"))
            continue;
        m_reader.move_by(9);
        // The keyword must be on a line of its own.
        if (!matches_eol())
            continue;
        m_reader.move_to(offset);
        return true;
    }
    return false;
}
// Context-free search for the name "/Linearized" starting at any '/' found in
// the first 1025 bytes (offsets 0..1024) of the buffer. Always leaves the
// reader at offset 0 in forwards mode.
//
// The search runs forwards: Reader::matches() compares against the reversed
// string while reading backwards, so a backwards scan that stops on '/' can
// never match "/Linearized" and would always report false.
bool Parser::sloppy_is_linearized()
{
    ScopeGuard guard([&] {
        m_reader.move_to(0);
        m_reader.set_reading_forwards();
    });

    auto limit = min(1024ul, m_reader.bytes().size() - 1);
    m_reader.set_reading_forwards();
    m_reader.move_to(0);

    while (!m_reader.done()) {
        m_reader.move_until('/');
        // Stop once the candidate lies outside the search window.
        if (m_reader.done() || m_reader.offset() > limit)
            break;
        if (m_reader.matches("/Linearized"))
            return true;
        m_reader.move_by(1);
    }
    return false;
}
// Parses a single '%' comment at the current position.
// Returns an empty String (and consumes nothing) when the reader is not on
// '%'. Otherwise returns the text between '%' and the end of the line, and
// consumes the line ending plus any whitespace that follows it.
String Parser::parse_comment()
{
    if (!m_reader.matches('%'))
        return {};

    consume();
    auto comment_start_offset = m_reader.offset();
    // move_until() takes a predicate that receives the current character;
    // accept (and ignore) it like every other call site in this file does.
    m_reader.move_until([&](auto) {
        return matches_eol();
    });
    String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
    consume_eol();
    consume_whitespace();
    return str;
}
// Parses whatever value starts at the current position: null, a boolean, a
// number / indirect object / indirect reference, a name, a dictionary
// (optionally followed by a stream), a string, or an array. A leading comment
// is skipped first. Hits VERIFY_NOT_REACHED() on anything else.
Value Parser::parse_value()
{
    parse_comment();

    if (m_reader.matches("null")) {
        m_reader.move_by(4);
        consume_whitespace();
        return Value();
    }
    if (m_reader.matches("true")) {
        m_reader.move_by(4);
        consume_whitespace();
        return Value(true);
    }
    if (m_reader.matches("false")) {
        m_reader.move_by(5);
        consume_whitespace();
        return Value(false);
    }

    if (matches_number())
        return parse_possible_indirect_value_or_ref();

    if (m_reader.matches('/'))
        return parse_name();

    // "<<" must be tested before the single '<' that introduces a hex string.
    if (m_reader.matches("<<")) {
        auto dict = parse_dict();
        // The "stream" keyword is followed by either LF or CR LF.
        if (m_reader.matches("stream\n") || m_reader.matches("stream\r\n"))
            return parse_stream(dict);
        return dict;
    }

    if (m_reader.matches_any('(', '<'))
        return parse_string();

    if (m_reader.matches('['))
        return parse_array();

    dbgln("tried to parse value, but found char {} ({}) at offset {}", m_reader.peek(), static_cast<u8>(m_reader.peek()), m_reader.offset());
    VERIFY_NOT_REACHED();
}
// Parses a number and then looks ahead to decide whether it is really the
// start of "N G R" (an indirect reference) or "N G obj" (an indirect object).
// When the lookahead does not pan out, the reader is rewound to just after
// the first number and that number is returned on its own.
Value Parser::parse_possible_indirect_value_or_ref()
{
    auto object_number = parse_number();
    if (!object_number.is_int())
        return object_number;
    if (!matches_number())
        return object_number;

    // Remember this position so the speculative parse below can be undone.
    m_reader.save();
    auto generation_number = parse_number();

    if (generation_number.is_int()) {
        if (m_reader.matches('R')) {
            m_reader.discard();
            consume();
            consume_whitespace();
            return make_object<IndirectValueRef>(object_number.as_int(), generation_number.as_int());
        }

        if (m_reader.matches("obj")) {
            m_reader.discard();
            return parse_indirect_value(object_number.as_int(), generation_number.as_int());
        }
    }

    m_reader.load();
    return object_number;
}
// Parses the remainder of an indirect object, starting at the "obj" keyword:
// "obj" <value> "endobj". The contained value must be an object. `index` and
// `generation` are the two numbers the caller already read.
NonnullRefPtr<IndirectValue> Parser::parse_indirect_value(int index, int generation)
{
    VERIFY(m_reader.matches("obj"));
    m_reader.move_by(3);
    if (matches_eol())
        consume_eol();

    auto value = parse_value();
    VERIFY(value.is_object());

    VERIFY(m_reader.matches("endobj"));
    // matches() does not advance the reader; step over the keyword before
    // requiring the whitespace that follows it.
    m_reader.move_by(6);
    VERIFY(consume_whitespace());

    return make_object<IndirectValue>(index, generation, value.as_object());
}
// Parses an optionally signed integer or real number and consumes any
// whitespace that follows it. Returns a float Value when the text contains a
// '.', and an int Value otherwise.
Value Parser::parse_number()
{
    size_t start_offset = m_reader.offset();
    bool is_float = false;

    if (m_reader.matches('+') || m_reader.matches('-'))
        consume();

    while (!m_reader.done()) {
        if (m_reader.matches('.')) {
            // A second '.' ends the number.
            if (is_float)
                break;
            is_float = true;
            consume();
        } else if (isdigit(m_reader.peek())) {
            consume();
        } else {
            break;
        }
    }

    auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));

    // Trailing whitespace is skipped for both kinds of number, so callers can
    // parse the next token immediately after a real as well as an integer.
    consume_whitespace();

    if (is_float)
        return Value(strtof(string.characters(), nullptr));

    // Integers are converted directly: going through float silently corrupts
    // values above 2^24, which includes byte offsets in files over 16 MiB.
    return Value(static_cast<int>(strtol(string.characters(), nullptr, 10)));
}
// Parses a name object: '/' followed by regular characters, where "#XX"
// encodes the byte with hexadecimal value XX. The name ends at whitespace, at
// a delimiter character, or at the end of input; trailing whitespace is
// consumed.
NonnullRefPtr<NameObject> Parser::parse_name()
{
    consume('/');
    StringBuilder builder;

    // Delimiters end a name without separating whitespace, e.g. "/Type/Page"
    // or "/Kids[...]" or "/Foo>>".
    auto matches_delimiter = [&] {
        return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
    };

    // Accepts both upper- and lowercase hexadecimal digits.
    auto hex_digit_value = [](char ch) -> int {
        VERIFY(isxdigit(static_cast<unsigned char>(ch)));
        if (ch <= '9')
            return ch - '0';
        if (ch >= 'a')
            return ch - 'a' + 10;
        return ch - 'A' + 10;
    };

    while (!m_reader.done()) {
        if (matches_whitespace() || matches_delimiter())
            break;

        if (m_reader.matches('#')) {
            // Skip the '#' itself; the two characters after it are the digits.
            consume('#');
            int hex_value = 0;
            for (int i = 0; i < 2; i++)
                hex_value = hex_value * 16 + hex_digit_value(consume());
            builder.append(static_cast<char>(hex_value));
            continue;
        }

        builder.append(consume());
    }

    consume_whitespace();
    return make_object<NameObject>(builder.to_string());
}
// Parses either a literal "(...)" string or a hexadecimal "<...>" string,
// then consumes trailing whitespace. The boolean handed to StringObject is
// true for the hexadecimal form.
NonnullRefPtr<StringObject> Parser::parse_string()
{
    auto is_hex = !m_reader.matches('(');
    auto contents = is_hex ? parse_hex_string() : parse_literal_string();
    auto string_object = make_object<StringObject>(contents, is_hex);
    consume_whitespace();
    return string_object;
}
// Parses a literal string "(...)" and returns its decoded contents.
// Balanced, unescaped parentheses are kept as part of the string. Backslash
// escapes (\n \r \t \b \f \( \) \\ and one-to-three digit octal codes) are
// decoded, a backslash followed by a line ending is a line continuation, and
// an unescaped line ending is stored as '\n'.
String Parser::parse_literal_string()
{
    consume('(');
    StringBuilder builder;
    auto opened_parens = 0;

    while (true) {
        if (m_reader.matches('(')) {
            opened_parens++;
            builder.append(consume());
        } else if (m_reader.matches(')')) {
            consume();
            // The parenthesis that balances the opening one ends the string.
            if (opened_parens == 0)
                break;
            opened_parens--;
            builder.append(')');
        } else if (m_reader.matches('\\')) {
            consume();
            if (matches_eol()) {
                consume_eol();
                continue;
            }

            VERIFY(!m_reader.done());
            auto ch = consume();
            switch (ch) {
            case 'n':
                builder.append('\n');
                break;
            case 'r':
                builder.append('\r');
                break;
            case 't':
                builder.append('\t');
                break;
            case 'b':
                builder.append('\b');
                break;
            case 'f':
                builder.append('\f');
                break;
            case '(':
                builder.append('(');
                break;
            case ')':
                builder.append(')');
                break;
            case '\\':
                builder.append('\\');
                break;
            default: {
                if (ch >= '0' && ch <= '7') {
                    int octal_value = ch - '0';
                    for (int i = 0; i < 2; i++) {
                        // Look before consuming: a character that is not an
                        // octal digit belongs to the rest of the string (it
                        // may even be the closing parenthesis).
                        if (m_reader.done())
                            break;
                        auto octal_ch = m_reader.peek();
                        if (octal_ch < '0' || octal_ch > '7')
                            break;
                        consume();
                        octal_value = octal_value * 8 + (octal_ch - '0');
                    }
                    builder.append(static_cast<char>(octal_value));
                } else {
                    // Unknown escape: drop the backslash, keep the character.
                    builder.append(ch);
                }
            }
            }
        } else if (matches_eol()) {
            consume_eol();
            builder.append('\n');
        } else {
            builder.append(consume());
        }
    }

    VERIFY(opened_parens == 0);
    return builder.to_string();
}
// Parses a hexadecimal string "<...>" and returns the decoded bytes.
// Whitespace between digits is ignored and both upper- and lowercase digits
// are accepted. When the number of digits is odd, the missing final digit is
// taken to be '0'.
String Parser::parse_hex_string()
{
    consume('<');
    StringBuilder builder;

    auto consume_hex_digit = [&]() -> int {
        auto ch = consume();
        VERIFY(isxdigit(static_cast<unsigned char>(ch)));
        if (ch <= '9')
            return ch - '0';
        if (ch >= 'a')
            return ch - 'a' + 10;
        return ch - 'A' + 10;
    };

    while (true) {
        consume_whitespace();
        if (m_reader.matches('>')) {
            consume();
            return builder.to_string();
        }
        int hex_value = consume_hex_digit() * 16;

        consume_whitespace();
        if (m_reader.matches('>')) {
            // Odd number of digits: the low nibble defaults to 0. Only the
            // '>' is consumed here, nothing after it.
            consume();
            builder.append(static_cast<char>(hex_value));
            return builder.to_string();
        }
        hex_value += consume_hex_digit();

        builder.append(static_cast<char>(hex_value));
    }
}
// Parses "[ value value ... ]" into an ArrayObject and consumes the
// whitespace after the closing bracket.
NonnullRefPtr<ArrayObject> Parser::parse_array()
{
    consume('[');
    consume_whitespace();

    Vector<Value> elements;
    for (;;) {
        if (m_reader.matches(']'))
            break;
        elements.append(parse_value());
    }

    consume(']');
    consume_whitespace();
    return make_object<ArrayObject>(elements);
}
// Parses "<< /Key value ... >>" into a DictObject and consumes the
// whitespace after the closing ">>". A key that appears more than once keeps
// the value parsed last.
NonnullRefPtr<DictObject> Parser::parse_dict()
{
    consume('<');
    consume('<');
    consume_whitespace();

    HashMap<FlyString, Value> entries;
    while (!m_reader.matches(">>")) {
        auto key = parse_name();
        auto entry_value = parse_value();
        entries.set(key->name(), entry_value);
    }

    consume('>');
    consume('>');
    consume_whitespace();
    return make_object<DictObject>(entries);
}
// Parses a stream body that follows the dictionary `dict`: the "stream"
// keyword and its line ending, /Length bytes of data, and the "endstream"
// keyword. The returned StreamObject refers to a slice of the reader's
// buffer rather than to a copy.
// `dict` must contain an integer "Length" entry.
NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
{
    VERIFY(m_reader.matches("stream"));
    m_reader.move_by(6);
    consume_eol();

    auto length_value = dict->map().get("Length");
    VERIFY(length_value.has_value());
    auto length = length_value.value();
    VERIFY(length.is_int());
    auto stream_length = length.as_int();
    VERIFY(stream_length >= 0);

    auto bytes = m_reader.bytes().slice(m_reader.offset(), stream_length);

    // Step over the data and the closing keyword so the caller continues
    // after the stream instead of in the middle of its contents.
    m_reader.move_by(stream_length);
    consume_whitespace();
    VERIFY(m_reader.matches("endstream"));
    m_reader.move_by(9);
    consume_whitespace();

    return make_object<StreamObject>(dict, bytes);
}
// True when the reader is on a line feed or a carriage return.
bool Parser::matches_eol() const
{
    return m_reader.matches('\n') || m_reader.matches('\r');
}
// True when the reader is on an end-of-line character, NUL, horizontal tab,
// form feed, or space.
bool Parser::matches_whitespace() const
{
    if (matches_eol())
        return true;
    return m_reader.matches_any('\0', '\t', '\f', ' ');
}
// True when the reader is on a character that can start a number: a decimal
// digit or a sign. Always false once the reader is done.
bool Parser::matches_number() const
{
    if (m_reader.done())
        return false;

    auto next = m_reader.peek();
    if (next == '-' || next == '+')
        return true;
    return isdigit(next) != 0;
}
// Consumes one line ending: CR LF as a pair, or a single CR or LF.
// Asserts (VERIFY) if the reader is not on a line ending.
void Parser::consume_eol()
{
    if (m_reader.matches("\r\n")) {
        // Skip both bytes directly. consume(2) would resolve to consume(char)
        // and assert that the next byte equals 0x02.
        m_reader.move_by(2);
    } else {
        auto consumed = consume();
        VERIFY(consumed == 0xd || consumed == 0xa);
    }
}
bool Parser::consume_whitespace()
{
bool consumed = false;
while (matches_whitespace()) {
consumed = true;
consume();
}
return consumed;
}
// Reads the character under the reader and advances by one in the current
// reading direction.
char Parser::consume()
{
    auto ch = m_reader.read();
    return ch;
}
// Consumes one character and asserts (VERIFY) that it equals `ch`.
void Parser::consume(char ch)
{
    auto consumed = consume();
    VERIFY(consumed == ch);
}
}

View file

@ -0,0 +1,72 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/NonnullRefPtrVector.h>
#include <LibPDF/Object.h>
#include <LibPDF/Reader.h>
#include <LibPDF/XRefTable.h>
namespace PDF {
class Document;
// Parser for PDF file structure and values. It reads through a Reader over
// the bytes passed to the constructor, and construction requires a
// Badge<Document>.
class Parser {
public:
Parser(Badge<Document>, const ReadonlyBytes&);
// Returns true when the file does not look linearized and its header parses.
bool perform_validation();
// Result of parse_last_xref_table_and_trailer().
struct XRefTableAndTrailer {
XRefTable xref_table;
NonnullRefPtr<DictObject> trailer;
};
// Finds the last "startxref" entry by scanning backwards from the end of the
// file, then parses the cross-reference table it points to and the trailer
// that follows that table.
XRefTableAndTrailer parse_last_xref_table_and_trailer();
private:
// Parses the "%PDF-M.m" header line and the optional comment after it.
bool parse_header();
// Parses an "xref" table up to (not including) the "trailer" keyword.
XRefTable parse_xref_table();
// Parses "trailer" <dict> "startxref" <line> "%%EOF" and returns the dict.
NonnullRefPtr<DictObject> parse_file_trailer();
// Backwards scans; both return false when the start of the file is reached.
bool navigate_to_before_eof_marker();
bool navigate_to_after_startxref();
// If the PDF is linearized, the first object will be the linearization
// parameter dictionary, and it will always occur within the first 1024 bytes.
// We do a very sloppy and context-free search for this object. A return value
// of true does not necessarily mean this PDF is linearized, but a return value
// of false does mean this PDF is not linearized.
// FIXME: false doesn't guarantee non-linearization, but we VERIFY the result!
bool sloppy_is_linearized();
// Parses one '%' comment; returns an empty String if there is none.
String parse_comment();
// Parses any value, dispatching on the upcoming characters.
Value parse_value();
// Parses a number, or "N G R" / "N G obj" when the number starts one of those.
Value parse_possible_indirect_value_or_ref();
// Parses "obj" <value> "endobj"; index and generation were read by the caller.
NonnullRefPtr<IndirectValue> parse_indirect_value(int index, int generation);
Value parse_number();
NonnullRefPtr<NameObject> parse_name();
// Parses a literal "(...)" or hexadecimal "<...>" string.
NonnullRefPtr<StringObject> parse_string();
String parse_literal_string();
String parse_hex_string();
NonnullRefPtr<ArrayObject> parse_array();
NonnullRefPtr<DictObject> parse_dict();
// Parses the stream body that follows `dict`; its length comes from dict's "Length".
NonnullRefPtr<StreamObject> parse_stream(NonnullRefPtr<DictObject> dict);
// Lookahead helpers: inspect the current character without consuming it.
bool matches_eol() const;
bool matches_whitespace() const;
bool matches_number() const;
// Consumes CR LF, CR, or LF.
void consume_eol();
// Skips whitespace; returns true if anything was skipped.
bool consume_whitespace();
// Consumes and returns one character.
char consume();
// Consumes one character and asserts that it equals the argument.
void consume(char);
Reader m_reader;
};
}

View file

@ -0,0 +1,154 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Debug.h>
#include <AK/Function.h>
#include <AK/ScopeGuard.h>
namespace PDF {
// A cursor over a read-only byte buffer that can be walked forwards or
// backwards. The movement, lookahead and matching operations all honour the
// current direction; move_to() positions the cursor absolutely.
class Reader {
public:
    explicit Reader(const ReadonlyBytes& bytes)
        : m_bytes(bytes)
    {
    }

    ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; }
    ALWAYS_INLINE size_t offset() const { return m_offset; }

    // True when the cursor has moved past the last byte (forwards) or before
    // the first byte (backwards).
    bool done() const
    {
        if (m_forwards)
            return offset() >= bytes().size();
        return m_offset < 0;
    }

    // Number of bytes that can still be read in the current direction,
    // including the byte under the cursor.
    size_t remaining() const
    {
        if (done())
            return 0;

        // Forwards, the bytes at [offset, size) are readable. Subtracting an
        // extra one here made matches() reject tokens that end exactly at the
        // end of the buffer.
        if (m_forwards)
            return bytes().size() - offset();
        return offset() + 1;
    }

    // Moves the cursor `count` bytes in the current direction.
    void move_by(size_t count)
    {
        if (m_forwards) {
            m_offset += static_cast<ssize_t>(count);
        } else {
            m_offset -= static_cast<ssize_t>(count);
        }
    }

    // Returns the byte under the cursor and advances by one.
    char read()
    {
        auto value = m_bytes.at(m_offset);
        move_by(1);
        return static_cast<char>(value);
    }

    // Returns the byte `shift` positions ahead in the current direction
    // without moving the cursor.
    char peek(size_t shift = 0) const
    {
        auto delta = static_cast<ssize_t>(shift);
        auto offset = m_forwards ? m_offset + delta : m_offset - delta;
        return static_cast<char>(m_bytes.at(offset));
    }

    // True when not done and the byte under the cursor equals any element.
    template<typename... T>
    bool matches_any(T... elements) const
    {
        if (done())
            return false;
        auto ch = peek();
        return ((ch == elements) || ...);
    }

    bool matches(char ch) const
    {
        return !done() && peek() == ch;
    }

    // True when the upcoming bytes, in the current direction, spell `chars`.
    // While reading backwards the comparison is made against the reversed
    // string, i.e. the cursor must be on the LAST character of `chars`.
    bool matches(const char* chars) const
    {
        String string(chars);
        if (remaining() < string.length())
            return false;

        if (!m_forwards)
            string = string.reverse();

        for (size_t i = 0; i < string.length(); i++) {
            if (peek(i) != string[i])
                return false;
        }
        return true;
    }

    // Moves the cursor to the absolute `offset`, which must be inside the
    // buffer. The template parameter is unused.
    template<typename T = char>
    void move_to(size_t offset)
    {
        VERIFY(offset < m_bytes.size());
        m_offset = static_cast<ssize_t>(offset);
    }

    // Advances until the byte under the cursor equals `ch` or the reader is done.
    void move_until(char ch)
    {
        while (!done() && peek() != ch)
            move_by(1);
    }

    // Advances until `predicate` returns true for the byte under the cursor
    // or the reader is done.
    void move_until(Function<bool(char)> predicate)
    {
        while (!done() && !predicate(peek()))
            move_by(1);
    }

    ALWAYS_INLINE void move_while(Function<bool(char)> predicate)
    {
        move_until([&predicate](char t) { return !predicate(t); });
    }

    ALWAYS_INLINE void set_reading_forwards() { m_forwards = true; }
    ALWAYS_INLINE void set_reading_backwards() { m_forwards = false; }

    // save() pushes the current offset onto a stack; load() pops the most
    // recent one and jumps back to it; discard() pops it without moving.
    ALWAYS_INLINE void save() { m_saved_offsets.append(m_offset); }
    ALWAYS_INLINE void load() { m_offset = m_saved_offsets.take_last(); }
    ALWAYS_INLINE void discard() { m_saved_offsets.take_last(); }

    // Debug aid: prints up to ten bytes on either side of the cursor.
    void dump_state()
    {
        StringBuilder builder;
        builder.append("Reader State Dump\n\n");

        // Signed arithmetic keeps the window inside the buffer; the unsigned
        // "offset() - 10" wrapped around whenever the cursor was within the
        // first ten bytes.
        auto last_index = static_cast<ssize_t>(bytes().size()) - 1;
        auto from = max(static_cast<ssize_t>(0), m_offset - 10);
        auto to = min(last_index, m_offset + 10);

        for (auto i = from; i <= to; i++) {
            char value = static_cast<char>(bytes().at(i));
            builder.appendff("{}: '{}' (value={:3d}) ", i, value, static_cast<u8>(value));
            if (i == m_offset)
                builder.appendff(" <<< current location, forwards={}", m_forwards);
            builder.append('\n');
        }
        builder.append('\n');

        auto str = builder.to_string();
        dbgputstr(str.characters(), str.length());
    }

private:
    ReadonlyBytes m_bytes;
    // Signed so that "one before the first byte" is representable when
    // reading backwards.
    ssize_t m_offset { 0 };
    Vector<ssize_t> m_saved_offsets;
    bool m_forwards { true };
};
}

View file

@ -0,0 +1,96 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Format.h>
namespace PDF {
// One cross-reference table entry. A default-constructed entry (byte_offset
// of -1) is what XRefTable uses as a placeholder for indices it has no data
// for.
struct XRefEntry {
// Byte offset parsed from the entry's first field; -1 means "no entry".
long byte_offset { -1L };
// Generation number parsed from the entry's second field.
u16 generation_number { 0 };
// True for entries marked 'n', false for entries marked 'f'.
bool in_use { false };
};
// One subsection of a cross-reference table: a run of entries for
// consecutive indices, beginning at starting_index.
struct XRefSection {
// Index of the first entry in `entries`.
int starting_index;
// Entry count as declared in the subsection header.
int count;
Vector<XRefEntry> entries;
};
class XRefTable {
public:
void add_section(const XRefSection& section)
{
m_entries.ensure_capacity(section.starting_index + section.count);
for (int i = static_cast<int>(m_entries.size()); i < section.starting_index; i++)
m_entries.append(XRefEntry {});
for (auto& entry : section.entries)
m_entries.append(entry);
}
[[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const
{
return index < m_entries.size() && m_entries[index].byte_offset != -1;
}
[[nodiscard]] ALWAYS_INLINE long byte_offset_for_object(size_t index) const
{
VERIFY(has_object(index));
return m_entries[index].byte_offset;
}
[[nodiscard]] ALWAYS_INLINE u16 generation_number_for_object(size_t index) const
{
VERIFY(has_object(index));
return m_entries[index].generation_number;
}
[[nodiscard]] ALWAYS_INLINE bool is_object_in_use(size_t index) const
{
VERIFY(has_object(index));
return m_entries[index].in_use;
}
private:
friend struct AK::Formatter<PDF::XRefTable>;
Vector<XRefEntry> m_entries;
};
}
namespace AK {
// Formats an XRefEntry as "XRefEntry { offset=... generation=... used=... }".
template<>
struct Formatter<PDF::XRefEntry> : Formatter<StringView> {
    void format(FormatBuilder& builder, const PDF::XRefEntry& entry)
    {
        auto text = String::formatted("XRefEntry {{ offset={} generation={} used={} }}",
            entry.byte_offset,
            entry.generation_number,
            entry.in_use);
        Formatter<StringView>::format(builder, text);
    }
};
// Formats an XRefTable as "XRefTable {", one line per entry, then "}".
template<>
struct Formatter<PDF::XRefTable> : Formatter<StringView> {
    void format(FormatBuilder& format_builder, const PDF::XRefTable& table)
    {
        StringBuilder text;
        text.append("XRefTable {");
        for (const auto& entry : table.m_entries)
            text.appendff("\n {}", entry);
        text.append("\n}");

        auto rendered = text.to_string();
        Formatter<StringView>::format(format_builder, rendered);
    }
};
}