mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-08-21 09:49:21 +00:00
LibPDF: Scan for PDF file start in first 1024 bytes
Other PDF readers do this too, and files in the wild depend on it.
Fixes opening these four files from the PDFA 0000.zip dataset:
* 0000015.pdf: starts with `C:\web\webeuncet\_cat\_docs\_publics\` before the header
* 0000408.pdf: starts with a UTF-8 BOM
* 0000524.pdf: starts with 867 bytes of HTML containing a PHP backtrace
* 0000680.pdf: starts with `C:\web\webeuncet\_cat\_docs\_publics\` too
This commit is contained in:
parent
9495f64f91
commit
0bb0c7dac2
Notes:
sideshowbarker
2024-07-17 06:35:23 +09:00
Author: https://github.com/nico
Commit: 0bb0c7dac2
Pull-request: https://github.com/SerenityOS/serenity/pull/22548
3 changed files with 21 additions and 0 deletions
|
@@ -97,6 +97,12 @@ ByteString Document::text_string_to_utf8(ByteString const& text_string)
|
|||
|
||||
PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
|
||||
{
|
||||
size_t offset_to_start = TRY(DocumentParser::scan_for_header_start(bytes));
|
||||
if (offset_to_start != 0) {
|
||||
dbgln("warning: PDF header not at start of file, skipping {} bytes", offset_to_start);
|
||||
bytes = bytes.slice(offset_to_start);
|
||||
}
|
||||
|
||||
auto parser = adopt_ref(*new DocumentParser({}, bytes));
|
||||
auto document = adopt_ref(*new Document(parser));
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue