FIX: Fixing pre-commit check_links (#462)
rlundeen2 authored Oct 11, 2024
1 parent 6b6c57b commit ecb51c1
Showing 1 changed file with 40 additions and 9 deletions.
49 changes: 40 additions & 9 deletions .github/check_links.py
@@ -1,7 +1,9 @@
 import re
 import sys
 import os
+import time
 import requests
+from urllib.parse import urlsplit, urlunsplit
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
@@ -20,17 +22,36 @@ def extract_urls(file_path):
         content = file.read()
     matches = URL_PATTERN.findall(content)
     # Flatten the list of tuples and filter out empty strings
-    urls = [url for match in matches for url in match if url]
+    urls = [strip_fragment(url) for match in matches for url in match if url]
     return urls
 
 
+def strip_fragment(url):
+    """
+    Removes the fragment (#...) from the URL, so the base URL can be checked.
+    """
+    parsed_url = urlsplit(url)
+    return urlunsplit((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.query, ""))
+
+
 def resolve_relative_url(base_path, url):
     if not url.startswith(("http://", "https://", "mailto:")):
         return os.path.abspath(os.path.join(os.path.dirname(base_path), url))
     return url
 
 
-def check_url(url):
+def check_url(url, retries=2, delay=2):
+    """
+    Check the validity of a URL, with retries if it fails.
+
+    Args:
+        url (str): URL to check.
+        retries (int, optional): Number of retries if the URL check fails. Defaults to 2.
+        delay (int, optional): Delay in seconds between retries. Defaults to 2.
+    Returns:
+        tuple: A tuple containing the URL and a boolean indicating whether it is valid.
+    """
+
     if (
         "http://localhost:" in url
         or url in skipped_urls
@@ -39,13 +60,23 @@ def check_url(url):
         or url.startswith("mailto:")
     ):
         return url, True
-    try:
-        response = requests.head(url, allow_redirects=True, timeout=5)
-        if response.status_code >= 400:
-            return url, False
-        return url, True
-    except requests.RequestException:
-        return url, False
+
+    attempts = 0
+    while attempts <= retries:
+        try:
+            response = requests.head(url, allow_redirects=True, timeout=5)
+            if response.status_code >= 400:
+                attempts += 1
+                if attempts > retries:
+                    return url, False
+                time.sleep(delay)
+            else:
+                return url, True
+        except requests.RequestException:
+            attempts += 1
+            if attempts > retries:
+                return url, False
+            time.sleep(delay)
 
 
 def check_links_in_file(file_path):
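
For context, here is a minimal, self-contained sketch of the two behaviors this patch introduces (fragment stripping, and retrying transient failures before reporting a link as broken), with the logic copied from the diff above so it runs standalone. The example URLs and the empty skipped_urls set are illustrative stand-ins, not values from the repository, and the skip conditions are abbreviated (the real hook checks a few more cases not shown in this diff).

import time
from urllib.parse import urlsplit, urlunsplit

import requests

skipped_urls = set()  # stand-in; the real script defines its own skip list


def strip_fragment(url):
    # Drop the #fragment so only the base URL is requested.
    parsed_url = urlsplit(url)
    return urlunsplit((parsed_url.scheme, parsed_url.netloc, parsed_url.path, parsed_url.query, ""))


def check_url(url, retries=2, delay=2):
    # Abbreviated skip list; the full hook skips additional URL patterns.
    if "http://localhost:" in url or url in skipped_urls or url.startswith("mailto:"):
        return url, True
    attempts = 0
    while attempts <= retries:
        try:
            response = requests.head(url, allow_redirects=True, timeout=5)
            if response.status_code >= 400:
                attempts += 1
                if attempts > retries:
                    return url, False
                time.sleep(delay)  # wait before retrying a 4xx/5xx response
            else:
                return url, True
        except requests.RequestException:
            attempts += 1
            if attempts > retries:
                return url, False
            time.sleep(delay)  # wait before retrying a connection error


if __name__ == "__main__":
    # Illustrative URL: the fragment is removed before the HEAD request.
    print(strip_fragment("https://example.com/docs#install"))  # https://example.com/docs
    print(check_url(strip_fragment("https://example.com/docs#install")))

An alternative would be a urllib3 Retry policy mounted on a requests.Session, but the explicit loop keeps the retry behavior, including retries on 4xx/5xx responses, easy to read in a small pre-commit hook.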
