From ba3b98b27489f8ac01b3cc3a039ca93b45737135 Mon Sep 17 00:00:00 2001 From: Mateusz <2871798+orhtej2@users.noreply.github.com> Date: Thu, 28 Nov 2024 21:42:10 +0100 Subject: [PATCH] Use proper URI parsing for asset tests. --- lib/curl_tests.py | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/lib/curl_tests.py b/lib/curl_tests.py index 5e73d10..112ce6f 100644 --- a/lib/curl_tests.py +++ b/lib/curl_tests.py @@ -6,7 +6,7 @@ import tempfile import pycurl from bs4 import BeautifulSoup -from urllib.parse import urlencode, urljoin +from urllib.parse import urlencode, urlparse from io import BytesIO DOMAIN = os.environ["DOMAIN"] @@ -163,9 +163,6 @@ def test( content = content.get_text().strip() if content else "" content = re.sub(r"[\t\n\s]{3,}", "\n\n", content) - base_tag = html.find("base") - base = base_tag.get("href", "") if base_tag else "" - errors = [] if expect_effective_url is None and "/yunohost/sso" in effective_url: errors.append( @@ -194,45 +191,41 @@ def test( assets_to_check = [] stylesheets = html.find_all("link", rel="stylesheet", href=True) stylesheets = [ - s + s["href"] for s in stylesheets if "ynh_portal" not in s["href"] and "ynhtheme" not in s["href"] and "ynh_overlay" not in s["href"] ] if stylesheets: - assets_to_check.append(stylesheets[0]["href"]) + for sheet in stylesheets: + parsed = urlparse(sheet) + if parsed.netloc != "" and parsed.netloc != domain: + continue + assets_to_check.append(parsed._replace(netloc=domain)._replace(scheme="https").geturl()) + break + js = html.find_all("script", src=True) js = [ - s + s["src"] for s in js if "ynh_portal" not in s["src"] and "ynhtheme" not in s["src"] and "ynh_overlay" not in s["src"] ] if js: - assets_to_check.append(js[0]["src"]) + for js in js: + parsed = urlparse(js) + if parsed.netloc != "" and parsed.netloc != domain: + continue + assets_to_check.append(parsed._replace(netloc=domain)._replace(scheme="https").geturl()) + break + if not assets_to_check: print( "\033[1m\033[93mWARN\033[0m auto_test_assets set to true, but no js/css asset found in this page" ) - for asset in assets_to_check: - # FIXME : this is pretty clumsy, should probably be replaced with a proper URL parsing to serparate domains etc... - if asset.startswith(f"//"): - asset = f"https:{asset}" - if asset.startswith(f"https://") or asset.startswith(f"http://"): - if asset.startswith(f"https://{domain}"): - asset = asset.replace(f"https://{domain}", "") - else: - print( - f"\033[1m\033[93mWARN\033[0m Found asset '{asset}' which seems to be hosted on a third party, external website ... Not super great for privacy etc... ?" - ) - continue - elif asset.startswith(f"{domain}/"): - asset = asset.replace(f"{domain}/", "") - if not asset.startswith("/"): - asset = urljoin(base + "/", asset) - resolved_asset_url = urljoin(f"https://{domain}", asset) + for resolved_asset_url in assets_to_check: asset_code, _, effective_asset_url = curl( resolved_asset_url, use_cookies=cookies )