From bcce4aad5416e02ba2a5025a50bfbc35d4cc7c34 Mon Sep 17 00:00:00 2001
From: Alexei
Date: Mon, 19 Feb 2024 16:39:06 -0500
Subject: [PATCH] Add unit tests for --exclude flag

---
 .github/workflows/pythonapp.yml | 15 ++++++----
 .prospector.yaml                |  2 ++
 crawler.py                      | 50 ++++++++++++++++-----------------
 requirements.txt                |  1 +
 tests/__init__.py               |  0
 tests/sitelist_test.py          | 39 +++++++++++++++++++++++++
 6 files changed, 76 insertions(+), 31 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/sitelist_test.py

diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml
index bb4bfae4..72c90ab7 100644
--- a/.github/workflows/pythonapp.yml
+++ b/.github/workflows/pythonapp.yml
@@ -1,21 +1,26 @@
-name: Python app
+name: Static analysis checks and unit tests
 
-on: [push]
+on: [pull_request, push, workflow_dispatch]
 
 jobs:
-  lint:
-
+  lint_and_tests:
     runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v3
+
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
         python-version: 3.8
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
-    - name: Run Prospector
+
+    - name: Run static analysis
       run: prospector -X
+
+    - name: Run unit tests
+      run: pytest
diff --git a/.prospector.yaml b/.prospector.yaml
index 5dbf023e..ac75de3a 100644
--- a/.prospector.yaml
+++ b/.prospector.yaml
@@ -1,5 +1,7 @@
 strictness: medium
 
+test-warnings: true
+
 pylint:
   disable:
     - invalid-name
diff --git a/crawler.py b/crawler.py
index fda9904f..0c3ea749 100755
--- a/crawler.py
+++ b/crawler.py
@@ -269,42 +269,41 @@ def internal_link(page_url, link_href):
 
 
 class Crawler:
-    def __init__(self, args):
-        self.browser = args.browser
-        self.browser_binary = args.browser_binary
-        self.chromedriver_path = args.chromedriver_path
-        self.domain_list = args.domain_list
-        self.exclude_suffixes = args.exclude
+    def __init__(self, opts):
+        self.browser_binary = opts.browser_binary
+        self.browser = opts.browser
+        self.chromedriver_path = opts.chromedriver_path
+        self.domain_list = opts.domain_list
         self.exclude_domains = get_recently_failed_domains()
-        self.firefox_tracking_protection = args.firefox_tracking_protection
-        self.load_extension = args.load_extension
-        self.no_blocking = args.no_blocking
-        self.load_data_ignore_sites = args.load_data_ignore_sites
-        self.num_sites = args.num_sites
-        self.out_dir = args.out_dir
-        self.pb_dir = args.pb_dir
-        self.take_screenshots = args.take_screenshots
-        self.timeout = args.timeout
-        self.wait_time = args.wait_time
-
-        # version is based on when the crawl started
+        self.exclude_suffixes = opts.exclude
+        self.firefox_tracking_protection = opts.firefox_tracking_protection
+        self.last_data = None
+        self.load_data_ignore_sites = opts.load_data_ignore_sites
+        self.load_extension = opts.load_extension
+        self.logger = logging.getLogger()
+        self.no_blocking = opts.no_blocking
+        self.num_sites = opts.num_sites
+        self.out_dir = opts.out_dir
+        self.pb_dir = opts.pb_dir
+        self.take_screenshots = opts.take_screenshots
+        self.timeout = opts.timeout
         self.version = time.strftime('%Y.%-m.%-d', time.localtime())
+        self.wait_time = opts.wait_time
 
-        self.last_data = None
+        pathlib.Path(self.out_dir).mkdir(exist_ok=True)
 
-    def init_logging(self):
-        self.logger = logging.getLogger()
+    def init_logging(self, log_stdout):
         self.logger.setLevel(logging.INFO)
+
         log_fmt = logging.Formatter('%(asctime)s %(message)s')
 
         # by default, just log to file
-        pathlib.Path(self.out_dir).mkdir(exist_ok=True)
         fh = logging.FileHandler(os.path.join(self.out_dir, 'log.txt'))
         fh.setFormatter(log_fmt)
         self.logger.addHandler(fh)
 
         # log to stdout as well if configured
-        if args.log_stdout:
+        if log_stdout:
             sh = logging.StreamHandler(sys.stdout)
             sh.setFormatter(log_fmt)
             self.logger.addHandler(sh)
@@ -1106,14 +1105,13 @@ def save(self, data, name='results.json'):
 
 
 if __name__ == '__main__':
-    ap = create_argument_parser()
-    args = ap.parse_args()
+    args = create_argument_parser().parse_args()
 
     # create an XVFB virtual display (to avoid opening an actual browser)
     with Xvfb(width=1920, height=1200) if not args.no_xvfb else contextlib.suppress():
         crawler = Crawler(args)
 
-        crawler.init_logging()
+        crawler.init_logging(args.log_stdout)
 
         crawler.logger.info("Fetching TLD definitions ...")
         crawler.tld_extract = TLDExtract(cache_dir=False, include_psl_private_domains=True)
diff --git a/requirements.txt b/requirements.txt
index 6ef7c720..cbd60d84 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 colorama==0.3.9
 prospector==1.9.0
+pytest
 selenium
 tldextract==3.1.2
 tranco==0.7.1
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/sitelist_test.py b/tests/sitelist_test.py
new file mode 100644
index 00000000..ef63478c
--- /dev/null
+++ b/tests/sitelist_test.py
@@ -0,0 +1,39 @@
+import pytest
+
+import crawler
+
+from tranco import Tranco
+
+
+class TestSitelist:
+
+    @pytest.mark.parametrize("num_sites, exclude, expected", [
+        (10, None, ["example.com", "example.net", "example.org"]),
+        (1, None, ["example.com"]),
+        (10, ".com", ["example.net", "example.org"]),
+        (10, ".gov,.mil,.net,.org", ["example.com"]),
+        (1, ".gov", ["example.com"])])
+    def test_exclude_suffixes(self, monkeypatch, num_sites, exclude, expected):
+        args = [f"--num-sites={num_sites}"]
+        if exclude:
+            args.append("--exclude=" + exclude)
+        cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
+
+        # mock out Tranco list
+        class MockResponse:
+            def top(self):
+                return ["example.com", "example.net", "example.org"]
+
+        def mock_get(self, list_version):  # pylint:disable=unused-argument
+            return MockResponse()
+
+        monkeypatch.setattr(Tranco, "list", mock_get)
+
+        # also clear exclude_domains
+        monkeypatch.setattr(cr, "exclude_domains", set())
+
+        assert cr.get_domain_list() == expected
+
+    @pytest.mark.skip()
+    def test_recently_failed_domains(self):
+        pass
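--
To try the new checks locally before pushing — a sketch, assuming a Python 3.8
environment at the repository root; these are the same commands the updated
workflow runs, in the same order:

    python -m pip install --upgrade pip
    pip install -r requirements.txt
    prospector -X    # static analysis
    pytest           # discovers tests/sitelist_test.py (matches *_test.py)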