Fix detection of sites that consistently time out
The set of consistent timeouts should include sites that timed out
in all scans that they appeared in, even if they didn't appear in
all the recent scans, as long as they appeared in more than one.

Following up on #83
ghostwords committed Mar 1, 2024
1 parent c585935 commit b0f06a0
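
The new rule, distilled into an illustrative sketch (not the shipped code; the helper name and signature below are made up for illustration):

    def is_consistent_timeout(timeout_count, num_scans, num_visits):
        # hypothetical distillation of the rule this commit implements
        if timeout_count >= num_scans:
            # timed out in every recent scan
            return True
        # timed out in every scan it appeared in, as long as
        # that was more than one scan
        return timeout_count > 1 and timeout_count == num_visits

    # a site visited in 2 of 3 recent scans that timed out both times
    # is now flagged:
    assert is_consistent_timeout(timeout_count=2, num_scans=3, num_visits=2)
    # a single timeout in a single appearance is still not enough:
    assert not is_consistent_timeout(timeout_count=1, num_scans=3, num_visits=1)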
Showing 2 changed files with 38 additions and 8 deletions.
13 changes: 11 additions & 2 deletions crawler.py
@@ -168,21 +168,30 @@ def get_recently_failed_domains():
     error_pattern = re.compile("(?:WebDriver|InsecureCertificate)Exception on ([^:]+):")
     timeout_pattern = re.compile("Timed out loading (.+)$")
     timeout_counts = {}
+    logs = []
 
     for rev in revisions:
-        logs = run(f"git show {rev}:log.txt".split(" "))
-        for line in logs.split('\n'):
+        log_txt = run(f"git show {rev}:log.txt".split(" "))
+        for line in log_txt.split('\n'):
             if matches := error_pattern.search(line):
                 domains.add(matches.group(1))
             elif matches := timeout_pattern.search(line):
                 domain = matches.group(1)
                 if domain != "extension page":
                     timeout_counts[domain] = timeout_counts.get(domain, 0) + 1
+        logs.append(log_txt)
 
     num_scans = len(revisions)
     for domain, count in timeout_counts.items():
         if count >= num_scans:
+            # site timed out in all recent scans
             domains.add(domain)
+        elif count > 1:
+            num_visits = sum(1 for log_txt in logs if re.search(
+                r"Visiting \d+: " + domain, log_txt))
+            if count == num_visits:
+                # site timed out in all recent scans **that it appeared in**
+                domains.add(domain)
 
     return domains

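For context, a quick standalone check of how the two log patterns above extract domains (sample lines are lifted from the test fixtures below):

    import re

    error_pattern = re.compile(r"(?:WebDriver|InsecureCertificate)Exception on ([^:]+):")
    timeout_pattern = re.compile(r"Timed out loading (.+)$")

    # exception lines yield the domain before the colon
    assert error_pattern.search(
        "WebDriverException on example.com: XXX").group(1) == "example.com"
    # timeout lines yield everything after "loading"
    assert timeout_pattern.search(
        "Timed out loading example.biz").group(1) == "example.biz"
    # "extension page" timeouts also match, which is why the loop
    # above skips that value explicitly
    assert timeout_pattern.search(
        "Timed out loading extension page").group(1) == "extension page"
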
33 changes: 27 additions & 6 deletions tests/sitelist_test.py
@@ -23,10 +23,12 @@ def test_excluding_suffixes(self, monkeypatch, exclude_suffixes, expected):
         args = ["firefox", "10"]
         args.append("--exclude=" + exclude_suffixes)
 
+        monkeypatch.setattr(crawler, "get_recently_failed_domains",
+                            lambda: set()) # pylint:disable=unnecessary-lambda
+
         cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
 
         monkeypatch.setattr(Tranco, "list", self.mock_tranco_list)
-        monkeypatch.setattr(cr, "exclude_domains", set())
 
         assert cr.get_domain_list() == expected

@@ -42,6 +44,10 @@ def test_get_domain_list(self, # pylint:disable=too-many-arguments
         args = ["firefox", num_sites]
         if exclude_suffixes:
             args.append("--exclude=" + exclude_suffixes)
+
+        monkeypatch.setattr(crawler, "get_recently_failed_domains",
+                            lambda: set()) # pylint:disable=unnecessary-lambda
+
         cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
 
         monkeypatch.setattr(Tranco, "list", self.mock_tranco_list)
@@ -54,25 +60,40 @@ def mock_run(cmd, cwd=None): # pylint:disable=unused-argument
             cmd = " ".join(cmd)
 
             if cmd == "git rev-list --since='1 week ago' HEAD -- log.txt":
-                return "abcde\nfghij"
+                return "abcde\nfghij\nklmno"
 
             if cmd == "git show abcde:log.txt":
-                return "\n".join(["WebDriverException on example.com: XXX",
+                return "\n".join(["Visiting 1: example.com",
+                                  "WebDriverException on example.com: XXX",
+                                  "Visiting 2: example.biz",
                                   "Timed out loading example.biz",
+                                  "Visiting 3: example.co.uk",
                                   "Timed out loading example.co.uk",
-                                  "Timed out loading extension page",])
+                                  "Timed out loading extension page",
+                                  "Timed out loading extension page"])
 
             if cmd == "git show fghij:log.txt":
-                return "\n".join(["WebDriverException on example.org: YYY",
+                return "\n".join(["Visiting 1: example.org",
+                                  "WebDriverException on example.org: YYY",
                                   "Timed out loading extension page",
+                                  "Visiting 2: example.co.uk",
+                                  "Timed out loading example.co.uk",
+                                  "Visiting 3: example.biz",
+                                  "Visiting 4: example.website",
+                                  "Timed out loading example.website",
+                                  "Visiting 5: example.net",
                                   "InsecureCertificateException on example.net: ZZZ"])
 
+            if cmd == "git show klmno:log.txt":
+                return "\n".join(["Visiting 1: example.website",
+                                  "Timed out loading example.website"])
+
             return ""
 
         monkeypatch.setattr(crawler, "run", mock_run)
 
         assert crawler.get_recently_failed_domains() == set(["example.com",
                                                              "example.net",
                                                              "example.org",
-                                                             "example.co.uk"])
+                                                             "example.co.uk",
+                                                             "example.website"])
