From 3fe9f8e48dfb52fd8ad0f2413072948c30166aee Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 23 Oct 2023 09:28:41 -0700 Subject: [PATCH] LibPDF: Don't accidentally form new tokens on pages with contents arrays A page's /Contents can be an array of streams, and the page's contents are then as if those streams are concatenated. Most of the time, a stream ends with whitespace. But in some cases (e.g. 0000642.pdf from 0000.zip from the pdfa dataset), the first stream ends with an operator (`Q`) and the next stream starts with one (`q`), and the concatenation would form a new, unknown operator (`Qq`). Separate the streams' contents with a space to prevent that. Reduces the number of PDF files we fail to open in the -n 500 case from 11 to 10 (in either case, we then crash on 18 of the PDFs that we do manage to open). --- Userland/Libraries/LibPDF/Page.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Userland/Libraries/LibPDF/Page.cpp b/Userland/Libraries/LibPDF/Page.cpp index efdefc38d48..c8d86cdcf85 100644 --- a/Userland/Libraries/LibPDF/Page.cpp +++ b/Userland/Libraries/LibPDF/Page.cpp @@ -19,13 +19,18 @@ PDFErrorOr Page::page_contents(Document& document) const // "The value may be either a single stream or an array of streams. If the value // is an array, the effect is as if all the streams in the array were concatenated, - // in order, to form a single stream." + // in order, to form a single stream. The division between streams may occur only at + // the boundaries between lexical tokens" if (contents->is()) return TRY(ByteBuffer::copy(contents->cast()->bytes())); + // If one stream ends with (say) a `Q` and the next starts with `q`, that should be + // two distinct tokens. Insert spaces between stream contents to ensure that. 
ByteBuffer byte_buffer; - for (auto& ref : *contents->cast()) + for (auto& ref : *contents->cast()) { TRY(byte_buffer.try_append(TRY(document.resolve_to(ref))->bytes())); + TRY(byte_buffer.try_append(' ')); + } return byte_buffer; }