Mirror of https://github.com/LadybirdBrowser/ladybird.git
Meta: Add a script for rendering many PDFs in parallel
The rendering happens only in-memory, so this is only useful for looking at the crash rate and at the reports of missing features. To actually see the rendered output of a file, use `pdf --render out.png --page N path/to/input.pdf` instead.
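That single-file render mode can also be driven from Python, mirroring how the batch script below invokes the binary. This is a minimal sketch, assuming the lagom `pdf` binary has been built at Build/lagom/bin/pdf and the command runs from the repository root; the page number and input path are placeholders:

    # Sketch: render page 1 of one PDF to out.png with the pdf tool.
    # Assumes Build/lagom/bin/pdf exists relative to the current directory.
    import subprocess
    subprocess.run(['Build/lagom/bin/pdf', '--render', 'out.png',
                    '--page', '1', 'path/to/input.pdf'], check=True)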
parent 34cb506bad
commit 5b36355be8

Notes (sideshowbarker, 2024-07-17 07:48:42 +09:00):
Author: https://github.com/nico
Commit: https://github.com/SerenityOS/serenity/commit/5b36355be8
Pull-request: https://github.com/SerenityOS/serenity/pull/21512
1 changed file with 84 additions and 0 deletions
Meta/test_pdf.py (executable file, +84)
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

'''Runs `pdf --debugging-stats` on a bunch of PDF files in parallel.

Give it one or more folders containing PDF files, and the optional -n flag
to pick a random subset of n PDFs:

    test_pdf.py -n 200 ~/Downloads/0000 ~/src/pdffiles

https://pdfa.org/new-large-scale-pdf-corpus-now-publicly-available/ has
8 TB of test PDFs, organized in a bunch of zip files with 1000 PDFs each.
One of those zip files, unzipped, makes for a good input folder.
'''

import argparse
import collections
import glob
import multiprocessing
import os
import random
import re
import subprocess


Result = collections.namedtuple(
    'Result', ['filename', 'returncode', 'stdout', 'stderr'])


def elide_aslr(s):
    return re.sub(rb'\b0x[0-9a-f]+\b', b'0xc3ns0r3d', s)


def test_pdf(filename):
    pdf_path = os.path.join(os.path.dirname(__file__), '../Build/lagom/bin/pdf')
    r = subprocess.run([pdf_path, '--debugging-stats', filename],
                       capture_output=True)
    return Result(filename, r.returncode, r.stdout, elide_aslr(r.stderr))


def main():
    parser = argparse.ArgumentParser(
        epilog=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('input', nargs='+', help='input directories')
    parser.add_argument('-n', type=int, help='render at most n pdfs')
    args = parser.parse_args()

    files = []
    for input_directory in args.input:
        files += glob.glob(os.path.join(input_directory, '*.pdf'))
    if args.n is not None:
        random.seed(42)
        files = random.sample(files, k=args.n)

    results = multiprocessing.Pool().map(test_pdf, files)

    num_crashes = 0
    stack_to_files = {}
    for r in results:
        print(r.filename)
        print(r.stdout.decode('utf-8'))
        if r.returncode != 0:
            num_crashes += 1
            stack_to_files.setdefault(r.stderr, []).append(r.filename)

    print('Top 5 crashiest stacks')
    keys = list(stack_to_files.keys())
    keys.sort(key=lambda x: len(stack_to_files[x]), reverse=True)
    for stack in reversed(keys[:5]):
        files = stack_to_files[stack]
        print(stack.decode('utf-8'), end='')
        print(f'In {len(files)} files:')
        for file in files:
            print(f'  {file}')
        print()

    print(f'{len(keys)} distinct crash stacks')

    percent = 100 * num_crashes / len(results)
    print(f'{num_crashes} crashes ({percent:.1f}%)')


if __name__ == '__main__':
    main()
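Worth noting: the `elide_aslr()` helper is what makes the crash grouping work at all. ASLR randomizes the addresses that appear in crash backtraces, so the raw stderr of two identical crashes would differ between runs and every crash would land in its own bucket. A minimal standalone sketch of the effect, re-implementing the regex from the script above with made-up backtrace lines:

    import re

    def elide_aslr(s):
        return re.sub(rb'\b0x[0-9a-f]+\b', b'0xc3ns0r3d', s)

    # Two runs of the same crash, loaded at different addresses:
    run1 = b'VERIFY failed\n0x00007f2a331d40 parse_xref\n'
    run2 = b'VERIFY failed\n0x000055d1a0be10 parse_xref\n'

    assert run1 != run2                          # raw stderr differs per run
    assert elide_aslr(run1) == elide_aslr(run2)  # normalized stacks match

    # So setdefault() in main() buckets both files under one stack:
    stack_to_files = {}
    for stderr, name in [(run1, 'a.pdf'), (run2, 'b.pdf')]:
        stack_to_files.setdefault(elide_aslr(stderr), []).append(name)
    assert list(stack_to_files.values()) == [['a.pdf', 'b.pdf']]

The script keeps stderr as raw bytes for this grouping and only decodes to UTF-8 when printing the top stacks at the end.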