diff --git a/crawler.py b/crawler.py index 1dbea5d..01ad5b1 100755 --- a/crawler.py +++ b/crawler.py @@ -168,21 +168,30 @@ def get_recently_failed_domains(): error_pattern = re.compile("(?:WebDriver|InsecureCertificate)Exception on ([^:]+):") timeout_pattern = re.compile("Timed out loading (.+)$") timeout_counts = {} + logs = [] for rev in revisions: - logs = run(f"git show {rev}:log.txt".split(" ")) - for line in logs.split('\n'): + log_txt = run(f"git show {rev}:log.txt".split(" ")) + for line in log_txt.split('\n'): if matches := error_pattern.search(line): domains.add(matches.group(1)) elif matches := timeout_pattern.search(line): domain = matches.group(1) if domain != "extension page": timeout_counts[domain] = timeout_counts.get(domain, 0) + 1 + logs.append(log_txt) num_scans = len(revisions) for domain, count in timeout_counts.items(): if count >= num_scans: + # site timed out in all recent scans domains.add(domain) + elif count > 1: + num_visits = sum(1 for log_txt in logs if re.search( + r"Visiting \d+: " + domain, log_txt)) + if count == num_visits: + # site timed out in all recent scans **that it appeared in** + domains.add(domain) return domains diff --git a/tests/sitelist_test.py b/tests/sitelist_test.py index e12c821..d6c1cd2 100644 --- a/tests/sitelist_test.py +++ b/tests/sitelist_test.py @@ -23,10 +23,12 @@ def test_excluding_suffixes(self, monkeypatch, exclude_suffixes, expected): args = ["firefox", "10"] args.append("--exclude=" + exclude_suffixes) + monkeypatch.setattr(crawler, "get_recently_failed_domains", + lambda: set()) # pylint:disable=unnecessary-lambda + cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args)) monkeypatch.setattr(Tranco, "list", self.mock_tranco_list) - monkeypatch.setattr(cr, "exclude_domains", set()) assert cr.get_domain_list() == expected @@ -42,6 +44,10 @@ def test_get_domain_list(self, # pylint:disable=too-many-arguments args = ["firefox", num_sites] if exclude_suffixes: args.append("--exclude=" + exclude_suffixes) + + monkeypatch.setattr(crawler, "get_recently_failed_domains", + lambda: set()) # pylint:disable=unnecessary-lambda + cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args)) monkeypatch.setattr(Tranco, "list", self.mock_tranco_list) @@ -54,20 +60,34 @@ def mock_run(cmd, cwd=None): # pylint:disable=unused-argument cmd = " ".join(cmd) if cmd == "git rev-list --since='1 week ago' HEAD -- log.txt": - return "abcde\nfghij" + return "abcde\nfghij\nklmno" if cmd == "git show abcde:log.txt": - return "\n".join(["WebDriverException on example.com: XXX", + return "\n".join(["Visiting 1: example.com", + "WebDriverException on example.com: XXX", + "Visiting 2: example.biz", "Timed out loading example.biz", + "Visiting 3: example.co.uk", "Timed out loading example.co.uk", - "Timed out loading extension page",]) + "Timed out loading extension page", + "Timed out loading extension page"]) if cmd == "git show fghij:log.txt": - return "\n".join(["WebDriverException on example.org: YYY", + return "\n".join(["Visiting 1: example.org", + "WebDriverException on example.org: YYY", "Timed out loading extension page", + "Visiting 2: example.co.uk", "Timed out loading example.co.uk", + "Visiting 3: example.biz", + "Visiting 4: example.website", + "Timed out loading example.website", + "Visiting 5: example.net", "InsecureCertificateException on example.net: ZZZ"]) + if cmd == "git show klmno:log.txt": + return "\n".join(["Visiting 1: example.website", + "Timed out loading example.website"]) + return "" monkeypatch.setattr(crawler, "run", mock_run) @@ -75,4 +95,5 @@ def mock_run(cmd, cwd=None): # pylint:disable=unused-argument assert crawler.get_recently_failed_domains() == set(["example.com", "example.net", "example.org", - "example.co.uk"]) + "example.co.uk", + "example.website"])