Handle consistent timeout detection corner case
Badger Swarm checks out Badger Sett with "git clone --depth=1"
and we don't want to exclude one-off timeouts.

Following up on #83
ghostwords committed Mar 8, 2024
1 parent 07124e9 commit f428250
Showing 1 changed file with 6 additions and 2 deletions.
crawler.py: 6 additions & 2 deletions

@@ -167,6 +167,7 @@ def get_recently_failed_domains():
     revisions = revisions.split('\n')

     error_pattern = re.compile("(?:Error loading|Exception on) ([^: ]+):")
+    num_scans = len(revisions)
     timeout_pattern = re.compile("Timed out loading ([^ ]+)$")
     timeout_counts = {}
     logs = []
@@ -179,9 +180,12 @@ def get_recently_failed_domains():
             elif matches := timeout_pattern.search(line):
                 domain = matches.group(1)
                 timeout_counts[domain] = timeout_counts.get(domain, 0) + 1
-        logs.append(log_txt)
+        if num_scans > 1:
+            logs.append(log_txt)
+
+    if num_scans == 1:  # not enough data to look at timeouts
+        return domains

-    num_scans = len(revisions)
     for domain, count in timeout_counts.items():
         if count >= num_scans:
             # site timed out in all recent scans
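For context on the corner case, below is a minimal sketch, not the actual crawler.py code: the function name consistently_timed_out and its logs_by_revision argument are made up for illustration, while the timeout regex and the count >= num_scans threshold mirror the diff. With a "git clone --depth=1" checkout there is only one revision of the scan log, so a domain that timed out a single time already satisfies count >= num_scans and would have been excluded as a consistent failure; the new num_scans == 1 guard skips timeout-based exclusion entirely in that case.

import re

# Illustrative sketch only: a simplified stand-in for the timeout logic in
# get_recently_failed_domains(). The function name, argument, and sample log
# lines are made up; the regex and threshold follow the diff above.
def consistently_timed_out(logs_by_revision):
    timeout_pattern = re.compile("Timed out loading ([^ ]+)$")

    num_scans = len(logs_by_revision)
    if num_scans == 1:
        # A --depth=1 checkout leaves a single revision, so one timeout
        # would look "consistent"; not enough data, skip the check.
        return set()

    timeout_counts = {}
    for log_txt in logs_by_revision:
        for line in log_txt.split("\n"):
            if matches := timeout_pattern.search(line):
                domain = matches.group(1)
                timeout_counts[domain] = timeout_counts.get(domain, 0) + 1

    # Only flag domains that timed out in every recent scan.
    return {domain for domain, count in timeout_counts.items()
            if count >= num_scans}

# One-off timeout in a shallow (single-revision) clone: no longer flagged.
print(consistently_timed_out(["Timed out loading example.com"]))  # set()

# Timed out in every scan across a fuller history: still flagged.
print(consistently_timed_out([
    "Timed out loading example.com",
    "Timed out loading example.com",
]))  # {'example.com'}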
