Meta: Make import-wpt-test.py use html.parser, not BeautifulSoup

This change switches the Meta/import-wpt-test.py script to using the standard html.parser module rather than BeautifulSoup. Otherwise, without this change, when a contributor first tries to run the script, if they don’t have BeautifulSoup installed, it will fail. Note that this patch also includes an unrelated small change that switches to using os.path.normpath — rather than Path.absolute() — to “normalize” the destination names of the downloaded test files.
Author: https://github.com/sideshowbarker Commit: https://github.com/LadybirdBrowser/ladybird/commit/286511c4cf9 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/2096
2025-04-22 12:35:14 +00:00 · 2024-11-01 16:09:06 +09:00 · 2024-11-01 16:09:06 +09:00 · 286511c4cf · 2024-11-01 11:29:03 +00:00
commit 286511c4cf
parent 56e1c0e7ee
1 changed files with 26 additions and 27 deletions
--- a/Meta/import-wpt-test.py
+++ b/Meta/import-wpt-test.py
@ -2,8 +2,9 @@

 import os
 import sys
+
+from html.parser import HTMLParser
 from pathlib import Path
-from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from urllib.request import urlopen
 from collections import namedtuple
@ -13,18 +14,16 @@ wpt_import_path = 'Tests/LibWeb/Text/input/wpt-import'
 wpt_expected_path = 'Tests/LibWeb/Text/expected/wpt-import'
 PathMapping = namedtuple('PathMapping', ['source', 'destination'])

+src_values = []

-def get_script_sources(page_source):
-    # Find all the <script> tags
-    scripts = [script for script in page_source.findAll('script')]

-    # Get the src attribute of each script tag
-    sources = list(map(lambda x: x.get('src'), scripts))
+class ScriptSrcValueFinder(HTMLParser):

-    # Remove None values
-    sources = list(filter(lambda x: x is not None, sources))
-
-    return sources
+    def handle_starttag(self, tag, attrs):
+        if tag == "script":
+            attr_dict = dict(attrs)
+            if "src" in attr_dict:
+                src_values.append(attr_dict["src"])


 def map_to_path(sources, is_resource=True, resource_path=None):
@ -65,17 +64,15 @@ def modify_sources(files):
        parent_folder_path = '../' * parent_folder_count

        with open(file, 'r') as f:
-            page_source = BeautifulSoup(f.read(), 'html.parser')
+            page_source = f.read()

-            # Iterate all scripts and overwrite the src attribute
-            scripts = [script for script in page_source.findAll('script')]
-            for script in scripts:
-                if script.get('src') is not None:
-                    if script['src'].startswith('/'):
-                        script['src'] = parent_folder_path + script['src'][1::]
-
-            with open(file, 'w') as f:
-                f.write(str(page_source))
+        # Iterate all scripts and overwrite the src attribute
+        for i, src_value in enumerate(src_values):
+            if src_value.startswith('/'):
+                new_src_value = parent_folder_path + src_value[1::]
+                page_source = page_source.replace(src_value, new_src_value)
+                with open(file, 'w') as f:
+                    f.write(str(page_source))


 def download_files(filepaths):
@ -83,7 +80,7 @@ def download_files(filepaths):

    for file in filepaths:
        source = urljoin(file.source, "/".join(file.source.split('/')[3:]))
-        destination = Path(file.destination).absolute()
+        destination = Path(os.path.normpath(file.destination))

        if destination.exists():
            print(f"Skipping {destination} as it already exists")
@ -132,13 +129,15 @@ def main():
    main_paths = map_to_path(main_file, False)
    files_to_modify = download_files(main_paths)
    create_expectation_files(main_paths)
+
+    with urlopen(url_to_import) as response:
+        page = response.read().decode("utf-8")
+
+    parser = ScriptSrcValueFinder()
+    parser.feed(page)
+
    modify_sources(files_to_modify)
-
-    page = urlopen(url_to_import)
-    page_source = BeautifulSoup(page, 'html.parser')
-
-    scripts = get_script_sources(page_source)
-    script_paths = map_to_path(scripts, True, resource_path)
+    script_paths = map_to_path(src_values, True, resource_path)
    download_files(script_paths)