From bcce4aad5416e02ba2a5025a50bfbc35d4cc7c34 Mon Sep 17 00:00:00 2001
From: Alexei
Date: Mon, 19 Feb 2024 16:39:06 -0500
Subject: [PATCH] Add unit tests for --exclude flag

---
 .github/workflows/pythonapp.yml | 15 ++++++----
 .prospector.yaml                |  2 ++
 crawler.py                      | 50 ++++++++++++++++-----------------
 requirements.txt                |  1 +
 tests/__init__.py               |  0
 tests/sitelist_test.py          | 39 +++++++++++++++++++++++++
 6 files changed, 76 insertions(+), 31 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/sitelist_test.py

diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index bb4bfae4..72c90ab7 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -1,21 +1,26 @@
-name: Python app
+name: Static analysis checks and unit tests
 
-on: [push]
+on: [pull_request, push, workflow_dispatch]
 
 jobs:
-  lint:
-
+  lint_and_tests:
     runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v3
+
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
         python-version: 3.8
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
-    - name: Run Prospector
+
+    - name: Run static analysis
       run: prospector -X
+
+    - name: Run unit tests
+      run: pytest
diff --git a/.prospector.yaml b/.prospector.yaml
index 5dbf023e..ac75de3a 100644
--- a/.prospector.yaml
+++ b/.prospector.yaml
@@ -1,5 +1,7 @@
 strictness: medium
 
+test-warnings: true
+
 pylint:
   disable:
     - invalid-name
diff --git a/crawler.py b/crawler.py
index fda9904f..0c3ea749 100755
--- a/crawler.py
+++ b/crawler.py
@@ -269,42 +269,41 @@ def internal_link(page_url, link_href):
 
 
 class Crawler:
-    def __init__(self, args):
-        self.browser = args.browser
-        self.browser_binary = args.browser_binary
-        self.chromedriver_path = args.chromedriver_path
-        self.domain_list = args.domain_list
-        self.exclude_suffixes = args.exclude
+    def __init__(self, opts):
+        self.browser_binary = opts.browser_binary
+        self.browser = opts.browser
+        self.chromedriver_path = opts.chromedriver_path
+        self.domain_list = opts.domain_list
         self.exclude_domains = get_recently_failed_domains()
-        self.firefox_tracking_protection = args.firefox_tracking_protection
-        self.load_extension = args.load_extension
-        self.no_blocking = args.no_blocking
-        self.load_data_ignore_sites = args.load_data_ignore_sites
-        self.num_sites = args.num_sites
-        self.out_dir = args.out_dir
-        self.pb_dir = args.pb_dir
-        self.take_screenshots = args.take_screenshots
-        self.timeout = args.timeout
-        self.wait_time = args.wait_time
-
-        # version is based on when the crawl started
+        self.exclude_suffixes = opts.exclude
+        self.firefox_tracking_protection = opts.firefox_tracking_protection
+        self.last_data = None
+        self.load_data_ignore_sites = opts.load_data_ignore_sites
+        self.load_extension = opts.load_extension
+        self.logger = logging.getLogger()
+        self.no_blocking = opts.no_blocking
+        self.num_sites = opts.num_sites
+        self.out_dir = opts.out_dir
+        self.pb_dir = opts.pb_dir
+        self.take_screenshots = opts.take_screenshots
+        self.timeout = opts.timeout
         self.version = time.strftime('%Y.%-m.%-d', time.localtime())
+        self.wait_time = opts.wait_time
 
-        self.last_data = None
+        pathlib.Path(self.out_dir).mkdir(exist_ok=True)
 
-    def init_logging(self):
-        self.logger = logging.getLogger()
+    def init_logging(self, log_stdout):
         self.logger.setLevel(logging.INFO)
+
         log_fmt = logging.Formatter('%(asctime)s %(message)s')
 
         # by default, just log to file
-        pathlib.Path(self.out_dir).mkdir(exist_ok=True)
         fh = logging.FileHandler(os.path.join(self.out_dir, 'log.txt'))
         fh.setFormatter(log_fmt)
         self.logger.addHandler(fh)
 
         # log to stdout as well if configured
-        if args.log_stdout:
+        if log_stdout:
             sh = logging.StreamHandler(sys.stdout)
             sh.setFormatter(log_fmt)
             self.logger.addHandler(sh)
@@ -1106,14 +1105,13 @@ def save(self, data, name='results.json'):
 
 
 if __name__ == '__main__':
-    ap = create_argument_parser()
-    args = ap.parse_args()
+    args = create_argument_parser().parse_args()
 
     # create an XVFB virtual display (to avoid opening an actual browser)
     with Xvfb(width=1920, height=1200) if not args.no_xvfb else contextlib.suppress():
         crawler = Crawler(args)
 
-        crawler.init_logging()
+        crawler.init_logging(args.log_stdout)
 
         crawler.logger.info("Fetching TLD definitions ...")
         crawler.tld_extract = TLDExtract(cache_dir=False, include_psl_private_domains=True)
diff --git a/requirements.txt b/requirements.txt
index 6ef7c720..cbd60d84 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 colorama==0.3.9
 prospector==1.9.0
+pytest
 selenium
 tldextract==3.1.2
 tranco==0.7.1
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/sitelist_test.py b/tests/sitelist_test.py
new file mode 100644
index 00000000..ef63478c
--- /dev/null
+++ b/tests/sitelist_test.py
@@ -0,0 +1,39 @@
+import pytest
+
+import crawler
+
+from tranco import Tranco
+
+
+class TestSitelist:
+
+    @pytest.mark.parametrize("num_sites, exclude, expected", [
+        (10, None, ["example.com", "example.net", "example.org"]),
+        (1, None, ["example.com"]),
+        (10, ".com", ["example.net", "example.org"]),
+        (10, ".gov,.mil,.net,.org", ["example.com"]),
+        (1, ".gov", ["example.com"])])
+    def test_exclude_suffixes(self, monkeypatch, num_sites, exclude, expected):
+        args = [f"--num-sites={num_sites}"]
+        if exclude:
+            args.append("--exclude=" + exclude)
+        cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
+
+        # mock out Tranco list
+        class MockResponse:
+            def top(self):
+                return ["example.com", "example.net", "example.org"]
+
+        def mock_get(self, list_version):  # pylint:disable=unused-argument
+            return MockResponse()
+
+        monkeypatch.setattr(Tranco, "list", mock_get)
+
+        # also clear exclude_domains
+        monkeypatch.setattr(cr, "exclude_domains", set())
+
+        assert cr.get_domain_list() == expected
+
+    @pytest.mark.skip()
+    def test_recently_failed_domains(self):
+        pass
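--
To try the new checks locally before pushing — a sketch, assuming a Python 3.8
environment at the repository root; these are the same commands the updated
workflow runs, in the same order:

    python -m pip install --upgrade pip
    pip install -r requirements.txt
    prospector -X    # static analysis
    pytest           # discovers tests/sitelist_test.py (matches *_test.py)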