diff --git a/.github/check_links.py b/.github/check_links.py
index c58aa3515..61b6c05df 100644
--- a/.github/check_links.py
+++ b/.github/check_links.py
@@ -1,7 +1,9 @@
 import re
 import sys
 import os
+import time
 import requests
+from urllib.parse import urlsplit, urlunsplit
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
@@ -20,17 +22,36 @@ def extract_urls(file_path):
         content = file.read()
     matches = URL_PATTERN.findall(content)
     # Flatten the list of tuples and filter out empty strings
-    urls = [url for match in matches for url in match if url]
+    urls = [strip_fragment(url) for match in matches for url in match if url]
     return urls
 
 
+def strip_fragment(url):
+    """
+    Removes the fragment (#...) from the URL, so the base URL can be checked.
+    """
+    parsed_url = urlsplit(url)
+    return urlunsplit((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.query, ""))
+
+
 def resolve_relative_url(base_path, url):
     if not url.startswith(("http://", "https://", "mailto:")):
         return os.path.abspath(os.path.join(os.path.dirname(base_path), url))
     return url
 
 
-def check_url(url):
+def check_url(url, retries=2, delay=2):
+    """
+    Check the validity of a URL, with retries if it fails.
+
+    Args:
+        url (str): URL to check.
+        retries (int, optional): Number of retries if the URL check fails. Defaults to 2.
+        delay (int, optional): Delay in seconds between retries. Defaults to 2.
+    Returns:
+        tuple: A tuple containing the URL and a boolean indicating whether it is valid.
+    """
+
     if (
         "http://localhost:" in url
         or url in skipped_urls
@@ -39,13 +60,23 @@ def check_url(url):
         or url.startswith("mailto:")
     ):
         return url, True
-    try:
-        response = requests.head(url, allow_redirects=True, timeout=5)
-        if response.status_code >= 400:
-            return url, False
-        return url, True
-    except requests.RequestException:
-        return url, False
+
+    attempts = 0
+    while attempts <= retries:
+        try:
+            response = requests.head(url, allow_redirects=True, timeout=5)
+            if response.status_code >= 400:
+                attempts += 1
+                if attempts > retries:
+                    return url, False
+                time.sleep(delay)
+            else:
+                return url, True
+        except requests.RequestException:
+            attempts += 1
+            if attempts > retries:
+                return url, False
+            time.sleep(delay)
 
 
 def check_links_in_file(file_path):
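
Sanity check (not part of the patch): the sketch below re-implements the two
new helpers exactly as written above, since .github/check_links.py is not
importable as a module, and uses unittest.mock to simulate a flaky endpoint.
The example URLs are hypothetical; the asserts confirm that fragments are
stripped while query strings survive, and that a URL failing twice before
succeeding is reported valid with the default retries=2.

import time
from unittest import mock
from urllib.parse import urlsplit, urlunsplit

import requests


def strip_fragment(url):
    # Rebuild the URL with an empty fragment, as in the patch.
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, parts.query, ""))


def check_url(url, retries=2, delay=2):
    # Retry loop from the patch: up to `retries` extra attempts,
    # sleeping `delay` seconds between failures.
    attempts = 0
    while attempts <= retries:
        try:
            response = requests.head(url, allow_redirects=True, timeout=5)
            if response.status_code >= 400:
                attempts += 1
                if attempts > retries:
                    return url, False
                time.sleep(delay)
            else:
                return url, True
        except requests.RequestException:
            attempts += 1
            if attempts > retries:
                return url, False
            time.sleep(delay)


if __name__ == "__main__":
    # Fragments are dropped; query strings survive.
    assert strip_fragment("https://example.com/docs?x=1#install") == \
        "https://example.com/docs?x=1"

    # Two timeouts, then a 200: with retries=2 the third attempt succeeds.
    ok = mock.Mock(status_code=200)
    with mock.patch("requests.head",
                    side_effect=[requests.Timeout(), requests.Timeout(), ok]), \
         mock.patch("time.sleep"):  # skip real delays in the demo
        assert check_url("https://example.com") == ("https://example.com", True)
    print("retry behaviour verified")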