Fix detection of sites that consistently time out
The set of consistent timeouts should include sites that timed out
in all scans that they appeared in, even if they didn't appear in
all the recent scans, as long as they appeared in more than one.

Following up on #83
ghostwords committed Mar 1, 2024
1 parent c585935 commit b0f06a0
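
The new rule, distilled into an illustrative sketch (not the shipped code; the helper name and signature below are made up for illustration):

    def is_consistent_timeout(timeout_count, num_scans, num_visits):
        # hypothetical distillation of the rule this commit implements
        if timeout_count >= num_scans:
            # timed out in every recent scan
            return True
        # timed out in every scan it appeared in, as long as
        # that was more than one scan
        return timeout_count > 1 and timeout_count == num_visits

    # a site visited in 2 of 3 recent scans that timed out both times
    # is now flagged:
    assert is_consistent_timeout(timeout_count=2, num_scans=3, num_visits=2)
    # a single timeout in a single appearance is still not enough:
    assert not is_consistent_timeout(timeout_count=1, num_scans=3, num_visits=1)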
Showing 2 changed files with 38 additions and 8 deletions.
13 changes: 11 additions & 2 deletions crawler.py
@@ -168,21 +168,30 @@ def get_recently_failed_domains():
     error_pattern = re.compile("(?:WebDriver|InsecureCertificate)Exception on ([^:]+):")
     timeout_pattern = re.compile("Timed out loading (.+)$")
     timeout_counts = {}
+    logs = []
 
     for rev in revisions:
-        logs = run(f"git show {rev}:log.txt".split(" "))
-        for line in logs.split('\n'):
+        log_txt = run(f"git show {rev}:log.txt".split(" "))
+        for line in log_txt.split('\n'):
             if matches := error_pattern.search(line):
                 domains.add(matches.group(1))
             elif matches := timeout_pattern.search(line):
                 domain = matches.group(1)
                 if domain != "extension page":
                     timeout_counts[domain] = timeout_counts.get(domain, 0) + 1
+        logs.append(log_txt)
 
     num_scans = len(revisions)
     for domain, count in timeout_counts.items():
         if count >= num_scans:
+            # site timed out in all recent scans
             domains.add(domain)
+        elif count > 1:
+            num_visits = sum(1 for log_txt in logs if re.search(
+                r"Visiting \d+: " + domain, log_txt))
+            if count == num_visits:
+                # site timed out in all recent scans **that it appeared in**
+                domains.add(domain)
 
     return domains

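For context, a quick standalone check of how the two log patterns above extract domains (sample lines are lifted from the test fixtures below):

    import re

    error_pattern = re.compile(r"(?:WebDriver|InsecureCertificate)Exception on ([^:]+):")
    timeout_pattern = re.compile(r"Timed out loading (.+)$")

    # exception lines yield the domain before the colon
    assert error_pattern.search(
        "WebDriverException on example.com: XXX").group(1) == "example.com"
    # timeout lines yield everything after "loading"
    assert timeout_pattern.search(
        "Timed out loading example.biz").group(1) == "example.biz"
    # "extension page" timeouts also match, which is why the loop
    # above skips that value explicitly
    assert timeout_pattern.search(
        "Timed out loading extension page").group(1) == "extension page"
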
33 changes: 27 additions & 6 deletions tests/sitelist_test.py
@@ -23,10 +23,12 @@ def test_excluding_suffixes(self, monkeypatch, exclude_suffixes, expected):
         args = ["firefox", "10"]
         args.append("--exclude=" + exclude_suffixes)
 
+        monkeypatch.setattr(crawler, "get_recently_failed_domains",
+                            lambda: set()) # pylint:disable=unnecessary-lambda
+
         cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
 
         monkeypatch.setattr(Tranco, "list", self.mock_tranco_list)
-        monkeypatch.setattr(cr, "exclude_domains", set())
 
         assert cr.get_domain_list() == expected

@@ -42,6 +44,10 @@ def test_get_domain_list(self, # pylint:disable=too-many-arguments
         args = ["firefox", num_sites]
         if exclude_suffixes:
             args.append("--exclude=" + exclude_suffixes)
+
+        monkeypatch.setattr(crawler, "get_recently_failed_domains",
+                            lambda: set()) # pylint:disable=unnecessary-lambda
+
         cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
 
         monkeypatch.setattr(Tranco, "list", self.mock_tranco_list)
@@ -54,25 +60,40 @@ def mock_run(cmd, cwd=None): # pylint:disable=unused-argument
             cmd = " ".join(cmd)
 
             if cmd == "git rev-list --since='1 week ago' HEAD -- log.txt":
-                return "abcde\nfghij"
+                return "abcde\nfghij\nklmno"
 
             if cmd == "git show abcde:log.txt":
-                return "\n".join(["WebDriverException on example.com: XXX",
+                return "\n".join(["Visiting 1: example.com",
+                                  "WebDriverException on example.com: XXX",
+                                  "Visiting 2: example.biz",
                                   "Timed out loading example.biz",
+                                  "Visiting 3: example.co.uk",
                                   "Timed out loading example.co.uk",
-                                  "Timed out loading extension page",])
+                                  "Timed out loading extension page",
+                                  "Timed out loading extension page"])
 
             if cmd == "git show fghij:log.txt":
-                return "\n".join(["WebDriverException on example.org: YYY",
+                return "\n".join(["Visiting 1: example.org",
+                                  "WebDriverException on example.org: YYY",
                                   "Timed out loading extension page",
+                                  "Visiting 2: example.co.uk",
+                                  "Timed out loading example.co.uk",
+                                  "Visiting 3: example.biz",
+                                  "Visiting 4: example.website",
+                                  "Timed out loading example.website",
+                                  "Visiting 5: example.net",
                                   "InsecureCertificateException on example.net: ZZZ"])
 
+            if cmd == "git show klmno:log.txt":
+                return "\n".join(["Visiting 1: example.website",
+                                  "Timed out loading example.website"])
+
             return ""
 
         monkeypatch.setattr(crawler, "run", mock_run)
 
         assert crawler.get_recently_failed_domains() == set(["example.com",
                                                              "example.net",
                                                              "example.org",
-                                                             "example.co.uk"])
+                                                             "example.co.uk",
+                                                             "example.website"])
