From b0f06a0bf8cf99ab0cf644fbb2411276d95521ce Mon Sep 17 00:00:00 2001
From: Alexei
Date: Fri, 1 Mar 2024 15:15:52 -0500
Subject: [PATCH] Fix detection of sites that consistently time out

The set of consistent timeouts should include sites that timed out in
all scans that they appeared in, even if they didn't appear in all the
recent scans, as long as they appeared in more than one.

Following up on https://github.com/EFForg/badger-sett/issues/83
---
 crawler.py             | 13 +++++++++++--
 tests/sitelist_test.py | 33 +++++++++++++++++++++++++++------
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/crawler.py b/crawler.py
index 1dbea5d..01ad5b1 100755
--- a/crawler.py
+++ b/crawler.py
@@ -168,21 +168,30 @@ def get_recently_failed_domains():
     error_pattern = re.compile("(?:WebDriver|InsecureCertificate)Exception on ([^:]+):")
     timeout_pattern = re.compile("Timed out loading (.+)$")
     timeout_counts = {}
+    logs = []
 
     for rev in revisions:
-        logs = run(f"git show {rev}:log.txt".split(" "))
-        for line in logs.split('\n'):
+        log_txt = run(f"git show {rev}:log.txt".split(" "))
+        for line in log_txt.split('\n'):
             if matches := error_pattern.search(line):
                 domains.add(matches.group(1))
             elif matches := timeout_pattern.search(line):
                 domain = matches.group(1)
                 if domain != "extension page":
                     timeout_counts[domain] = timeout_counts.get(domain, 0) + 1
+        logs.append(log_txt)
 
     num_scans = len(revisions)
     for domain, count in timeout_counts.items():
         if count >= num_scans:
+            # site timed out in all recent scans
             domains.add(domain)
+        elif count > 1:
+            # escape the domain so that dots match literally
+            num_visits = sum(1 for log_txt in logs if re.search(
+                r"Visiting \d+: " + re.escape(domain), log_txt))
+            if count == num_visits:
+                # site timed out in all recent scans **that it appeared in**
+                domains.add(domain)
 
     return domains
 
diff --git a/tests/sitelist_test.py b/tests/sitelist_test.py
index e12c821..d6c1cd2 100644
--- a/tests/sitelist_test.py
+++ b/tests/sitelist_test.py
@@ -23,10 +23,12 @@ def test_excluding_suffixes(self, monkeypatch, exclude_suffixes, expected):
         args = ["firefox", "10"]
         args.append("--exclude=" + exclude_suffixes)
 
+        monkeypatch.setattr(crawler, "get_recently_failed_domains",
+                            lambda: set()) # pylint:disable=unnecessary-lambda
+
         cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
 
         monkeypatch.setattr(Tranco, "list", self.mock_tranco_list)
-        monkeypatch.setattr(cr, "exclude_domains", set())
 
         assert cr.get_domain_list() == expected
 
@@ -42,6 +44,10 @@ def test_get_domain_list(self, # pylint:disable=too-many-arguments
         args = ["firefox", num_sites]
         if exclude_suffixes:
             args.append("--exclude=" + exclude_suffixes)
+
+        monkeypatch.setattr(crawler, "get_recently_failed_domains",
+                            lambda: set()) # pylint:disable=unnecessary-lambda
+
         cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
 
         monkeypatch.setattr(Tranco, "list", self.mock_tranco_list)
@@ -54,20 +60,34 @@ def mock_run(cmd, cwd=None): # pylint:disable=unused-argument
             cmd = " ".join(cmd)
 
             if cmd == "git rev-list --since='1 week ago' HEAD -- log.txt":
-                return "abcde\nfghij"
+                return "abcde\nfghij\nklmno"
 
             if cmd == "git show abcde:log.txt":
-                return "\n".join(["WebDriverException on example.com: XXX",
+                return "\n".join(["Visiting 1: example.com",
+                                  "WebDriverException on example.com: XXX",
+                                  "Visiting 2: example.biz",
                                   "Timed out loading example.biz",
+                                  "Visiting 3: example.co.uk",
                                   "Timed out loading example.co.uk",
-                                  "Timed out loading extension page",])
+                                  "Timed out loading extension page",
+                                  "Timed out loading extension page"])
 
             if cmd == "git show fghij:log.txt":
-                return "\n".join(["WebDriverException on example.org: YYY",
return "\n".join(["WebDriverException on example.org: YYY", + return "\n".join(["Visiting 1: example.org", + "WebDriverException on example.org: YYY", "Timed out loading extension page", + "Visiting 2: example.co.uk", "Timed out loading example.co.uk", + "Visiting 3: example.biz", + "Visiting 4: example.website", + "Timed out loading example.website", + "Visiting 5: example.net", "InsecureCertificateException on example.net: ZZZ"]) + if cmd == "git show klmno:log.txt": + return "\n".join(["Visiting 1: example.website", + "Timed out loading example.website"]) + return "" monkeypatch.setattr(crawler, "run", mock_run) @@ -75,4 +95,5 @@ def mock_run(cmd, cwd=None): # pylint:disable=unused-argument assert crawler.get_recently_failed_domains() == set(["example.com", "example.net", "example.org", - "example.co.uk"]) + "example.co.uk", + "example.website"])