Meta: Make import-wpt-test.py use html.parser, not BeautifulSoup

This change switches the Meta/import-wpt-test.py script to using the
standard html.parser module rather than BeautifulSoup.

Without this change, a contributor who does not have BeautifulSoup
installed will hit a failure the first time they try to run the script.

Note that this patch also includes an unrelated small change that
switches to using os.path.normpath — rather than Path.absolute() — to
“normalize” the destination names of the downloaded test files.
This commit is contained in:
sideshowbarker 2024-11-01 16:09:06 +09:00 committed by Andreas Kling
commit 286511c4cf
Notes: github-actions[bot] 2024-11-01 11:29:03 +00:00

View file

@@ -2,8 +2,9 @@
 import os
 import sys
+from html.parser import HTMLParser
 from pathlib import Path
-from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from urllib.request import urlopen
 from collections import namedtuple
@@ -13,18 +14,16 @@ wpt_import_path = 'Tests/LibWeb/Text/input/wpt-import'
 wpt_expected_path = 'Tests/LibWeb/Text/expected/wpt-import'

 PathMapping = namedtuple('PathMapping', ['source', 'destination'])

-def get_script_sources(page_source):
-    # Find all the <script> tags
-    scripts = [script for script in page_source.findAll('script')]
-    # Get the src attribute of each script tag
-    sources = list(map(lambda x: x.get('src'), scripts))
-    # Remove None values
-    sources = list(filter(lambda x: x is not None, sources))
-    return sources
+src_values = []
+
+
+class ScriptSrcValueFinder(HTMLParser):
+    def handle_starttag(self, tag, attrs):
+        if tag == "script":
+            attr_dict = dict(attrs)
+            if "src" in attr_dict:
+                src_values.append(attr_dict["src"])


 def map_to_path(sources, is_resource=True, resource_path=None):
@@ -65,17 +64,15 @@ def modify_sources(files):
         parent_folder_path = '../' * parent_folder_count

         with open(file, 'r') as f:
-            page_source = BeautifulSoup(f.read(), 'html.parser')
+            page_source = f.read()

         # Iterate all scripts and overwrite the src attribute
-        scripts = [script for script in page_source.findAll('script')]
-        for script in scripts:
-            if script.get('src') is not None:
-                if script['src'].startswith('/'):
-                    script['src'] = parent_folder_path + script['src'][1::]
+        for i, src_value in enumerate(src_values):
+            if src_value.startswith('/'):
+                new_src_value = parent_folder_path + src_value[1::]
+                page_source = page_source.replace(src_value, new_src_value)

         with open(file, 'w') as f:
             f.write(str(page_source))


 def download_files(filepaths):
@@ -83,7 +80,7 @@ def download_files(filepaths):
     for file in filepaths:
         source = urljoin(file.source, "/".join(file.source.split('/')[3:]))
-        destination = Path(file.destination).absolute()
+        destination = Path(os.path.normpath(file.destination))

         if destination.exists():
             print(f"Skipping {destination} as it already exists")
@@ -132,13 +129,15 @@ def main():
     main_paths = map_to_path(main_file, False)
     files_to_modify = download_files(main_paths)
     create_expectation_files(main_paths)
+
+    with urlopen(url_to_import) as response:
+        page = response.read().decode("utf-8")
+
+    parser = ScriptSrcValueFinder()
+    parser.feed(page)
+
     modify_sources(files_to_modify)

-    page = urlopen(url_to_import)
-    page_source = BeautifulSoup(page, 'html.parser')
-    scripts = get_script_sources(page_source)
-    script_paths = map_to_path(scripts, True, resource_path)
+    script_paths = map_to_path(src_values, True, resource_path)
     download_files(script_paths)