Mirror of https://github.com/LadybirdBrowser/ladybird.git
Meta: Add a script for rendering many PDFs in parallel
The rendering happens only in-memory, so this is only useful for looking at the crash rate and at the reports of missing features. To actually see the rendered output of a file, use `pdf --render out.png --page N path/to/input.pdf` instead.
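That single-file render mode can also be driven from Python, mirroring how the batch script below invokes the binary. This is a minimal sketch, assuming the lagom `pdf` binary has been built at Build/lagom/bin/pdf and the command runs from the repository root; the page number and input path are placeholders:

    # Sketch: render page 1 of one PDF to out.png with the pdf tool.
    # Assumes Build/lagom/bin/pdf exists relative to the current directory.
    import subprocess
    subprocess.run(['Build/lagom/bin/pdf', '--render', 'out.png',
                    '--page', '1', 'path/to/input.pdf'], check=True)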
parent 34cb506bad
commit 5b36355be8

Notes (sideshowbarker, 2024-07-17 07:48:42 +09:00):
Author: https://github.com/nico
Commit: https://github.com/SerenityOS/serenity/commit/5b36355be8
Pull-request: https://github.com/SerenityOS/serenity/pull/21512
1 changed file with 84 additions and 0 deletions
Meta/test_pdf.py (executable file, +84)
@@ -0,0 +1,84 @@
#!/usr/bin/env python3

'''Runs `pdf --debugging-stats` on a bunch of PDF files in parallel.

Give it one or more folders containing PDF files, and the optional -n flag
to pick a random subset of n PDFs:

    test_pdf.py -n 200 ~/Downloads/0000 ~/src/pdffiles

https://pdfa.org/new-large-scale-pdf-corpus-now-publicly-available/ has
8 TB of test PDFs, organized in a bunch of zip files with 1000 PDFs each.
One of those zip files, unzipped, makes for a good input folder.
'''

import argparse
import collections
import glob
import multiprocessing
import os
import random
import re
import subprocess


Result = collections.namedtuple(
    'Result', ['filename', 'returncode', 'stdout', 'stderr'])


def elide_aslr(s):
    return re.sub(rb'\b0x[0-9a-f]+\b', b'0xc3ns0r3d', s)


def test_pdf(filename):
    pdf_path = os.path.join(os.path.dirname(__file__), '../Build/lagom/bin/pdf')
    r = subprocess.run([pdf_path, '--debugging-stats', filename],
                       capture_output=True)
    return Result(filename, r.returncode, r.stdout, elide_aslr(r.stderr))


def main():
    parser = argparse.ArgumentParser(
        epilog=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('input', nargs='+', help='input directories')
    parser.add_argument('-n', type=int, help='render at most n pdfs')
    args = parser.parse_args()

    files = []
    for input_directory in args.input:
        files += glob.glob(os.path.join(input_directory, '*.pdf'))
    if args.n is not None:
        random.seed(42)
        files = random.sample(files, k=args.n)

    results = multiprocessing.Pool().map(test_pdf, files)

    num_crashes = 0
    stack_to_files = {}
    for r in results:
        print(r.filename)
        print(r.stdout.decode('utf-8'))
        if r.returncode != 0:
            num_crashes += 1
            stack_to_files.setdefault(r.stderr, []).append(r.filename)

    print('Top 5 crashiest stacks')
    keys = list(stack_to_files.keys())
    keys.sort(key=lambda x: len(stack_to_files[x]), reverse=True)
    for stack in reversed(keys[:5]):
        files = stack_to_files[stack]
        print(stack.decode('utf-8'), end='')
        print(f'In {len(files)} files:')
        for file in files:
            print(f'  {file}')
        print()

    print(f'{len(keys)} distinct crash stacks')

    percent = 100 * num_crashes / len(results)
    print(f'{num_crashes} crashes ({percent:.1f}%)')


if __name__ == '__main__':
    main()
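Worth noting: the `elide_aslr()` helper is what makes the crash grouping work at all. ASLR randomizes the addresses that appear in crash backtraces, so the raw stderr of two identical crashes would differ between runs and every crash would land in its own bucket. A minimal standalone sketch of the effect, re-implementing the regex from the script above with made-up backtrace lines:

    import re

    def elide_aslr(s):
        return re.sub(rb'\b0x[0-9a-f]+\b', b'0xc3ns0r3d', s)

    # Two runs of the same crash, loaded at different addresses:
    run1 = b'VERIFY failed\n0x00007f2a331d40 parse_xref\n'
    run2 = b'VERIFY failed\n0x000055d1a0be10 parse_xref\n'

    assert run1 != run2                          # raw stderr differs per run
    assert elide_aslr(run1) == elide_aslr(run2)  # normalized stacks match

    # So setdefault() in main() buckets both files under one stack:
    stack_to_files = {}
    for stderr, name in [(run1, 'a.pdf'), (run2, 'b.pdf')]:
        stack_to_files.setdefault(elide_aslr(stderr), []).append(name)
    assert list(stack_to_files.values()) == [['a.pdf', 'b.pdf']]

The script keeps stderr as raw bytes for this grouping and only decodes to UTF-8 when printing the top stacks at the end.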