Add unit tests for --exclude flag
ghostwords committed Feb 19, 2024
1 parent 2f0b403 commit bcce4aa
Showing 6 changed files with 76 additions and 31 deletions.
15 changes: 10 additions & 5 deletions .github/workflows/pythonapp.yml
@@ -1,21 +1,26 @@
-name: Python app
+name: Static analysis checks and unit tests
 
-on: [push]
+on: [pull_request, push, workflow_dispatch]
 
 jobs:
-  lint:
-
+  lint_and_tests:
     runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v3
+
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
         python-version: 3.8
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
-    - name: Run Prospector
+    - name: Run static analysis
       run: prospector -X
+
+    - name: Run unit tests
+      run: pytest
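
Note: the renamed job runs the linter and the new test suite as separate steps. A quick way to mirror both checks locally is a sketch like the following (assuming the dependencies from requirements.txt are installed; the runner script itself is illustrative, not part of the repo):

    # Sketch: run the same two checks as the CI job, in order,
    # stopping at the first failure just as the workflow would.
    import subprocess
    import sys

    for step in (["prospector", "-X"], ["pytest"]):
        print("Running:", " ".join(step))
        if subprocess.run(step, check=False).returncode != 0:
            sys.exit(1)  # a failed step fails the whole run
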
2 changes: 2 additions & 0 deletions .prospector.yaml
@@ -1,5 +1,7 @@
 strictness: medium
 
+test-warnings: true
+
 pylint:
   disable:
     - invalid-name
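
Note: per Prospector's documented behavior, it suppresses messages from files it detects as tests by default, so without test-warnings: true the new tests/ directory added in this commit would be skipped by the static analysis step.
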
50 changes: 24 additions & 26 deletions crawler.py
@@ -269,42 +269,41 @@ def internal_link(page_url, link_href):
 
 
 class Crawler:
-    def __init__(self, args):
-        self.browser = args.browser
-        self.browser_binary = args.browser_binary
-        self.chromedriver_path = args.chromedriver_path
-        self.domain_list = args.domain_list
-        self.exclude_suffixes = args.exclude
+    def __init__(self, opts):
+        self.browser_binary = opts.browser_binary
+        self.browser = opts.browser
+        self.chromedriver_path = opts.chromedriver_path
+        self.domain_list = opts.domain_list
         self.exclude_domains = get_recently_failed_domains()
-        self.firefox_tracking_protection = args.firefox_tracking_protection
-        self.load_extension = args.load_extension
-        self.no_blocking = args.no_blocking
-        self.load_data_ignore_sites = args.load_data_ignore_sites
-        self.num_sites = args.num_sites
-        self.out_dir = args.out_dir
-        self.pb_dir = args.pb_dir
-        self.take_screenshots = args.take_screenshots
-        self.timeout = args.timeout
-        self.wait_time = args.wait_time
-
-        # version is based on when the crawl started
+        self.exclude_suffixes = opts.exclude
+        self.firefox_tracking_protection = opts.firefox_tracking_protection
+        self.last_data = None
+        self.load_data_ignore_sites = opts.load_data_ignore_sites
+        self.load_extension = opts.load_extension
+        self.logger = logging.getLogger()
+        self.no_blocking = opts.no_blocking
+        self.num_sites = opts.num_sites
+        self.out_dir = opts.out_dir
+        self.pb_dir = opts.pb_dir
+        self.take_screenshots = opts.take_screenshots
+        self.timeout = opts.timeout
         self.version = time.strftime('%Y.%-m.%-d', time.localtime())
+        self.wait_time = opts.wait_time
 
-        self.last_data = None
+        pathlib.Path(self.out_dir).mkdir(exist_ok=True)
 
-    def init_logging(self):
-        self.logger = logging.getLogger()
+    def init_logging(self, log_stdout):
         self.logger.setLevel(logging.INFO)
 
         log_fmt = logging.Formatter('%(asctime)s %(message)s')
 
         # by default, just log to file
-        pathlib.Path(self.out_dir).mkdir(exist_ok=True)
         fh = logging.FileHandler(os.path.join(self.out_dir, 'log.txt'))
         fh.setFormatter(log_fmt)
         self.logger.addHandler(fh)
 
         # log to stdout as well if configured
-        if args.log_stdout:
+        if log_stdout:
             sh = logging.StreamHandler(sys.stdout)
             sh.setFormatter(log_fmt)
             self.logger.addHandler(sh)
@@ -1106,14 +1105,13 @@ def save(self, data, name='results.json'):
 
 
 if __name__ == '__main__':
-    ap = create_argument_parser()
-    args = ap.parse_args()
+    args = create_argument_parser().parse_args()
 
     # create an XVFB virtual display (to avoid opening an actual browser)
     with Xvfb(width=1920, height=1200) if not args.no_xvfb else contextlib.suppress():
         crawler = Crawler(args)
 
-        crawler.init_logging()
+        crawler.init_logging(args.log_stdout)
 
         crawler.logger.info("Fetching TLD definitions ...")
         crawler.tld_extract = TLDExtract(cache_dir=False, include_psl_private_domains=True)
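
Note: this reshuffle is what makes the new tests possible. __init__ now takes any parsed-options object instead of reading the module-level args, and init_logging receives its stdout flag explicitly, so a Crawler can be built straight from a parsed argument list, as in this minimal sketch (the flag and attribute names come from the diffs and tests in this commit; the rest is illustrative):

    # Minimal sketch: construct a Crawler directly from parsed options,
    # with no dependency on a module-level `args`.
    import crawler

    opts = crawler.create_argument_parser().parse_args(["--num-sites=3"])
    c = crawler.Crawler(opts)        # __init__ just copies options onto self
    c.init_logging(opts.log_stdout)  # stdout logging is now an explicit parameter
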
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
 colorama==0.3.9
 prospector==1.9.0
+pytest
 selenium
 tldextract==3.1.2
 tranco==0.7.1
Empty file added tests/__init__.py
39 changes: 39 additions & 0 deletions tests/sitelist_test.py
@@ -0,0 +1,39 @@
+import pytest
+
+import crawler
+
+from tranco import Tranco
+
+
+class TestSitelist:
+
+    @pytest.mark.parametrize("num_sites, exclude, expected", [
+        (10, None, ["example.com", "example.net", "example.org"]),
+        (1, None, ["example.com"]),
+        (10, ".com", ["example.net", "example.org"]),
+        (10, ".gov,.mil,.net,.org", ["example.com"]),
+        (1, ".gov", ["example.com"])])
+    def test_exclude_suffixes(self, monkeypatch, num_sites, exclude, expected):
+        args = [f"--num-sites={num_sites}"]
+        if exclude:
+            args.append("--exclude=" + exclude)
+        cr = crawler.Crawler(crawler.create_argument_parser().parse_args(args))
+
+        # mock out Tranco list
+        class MockResponse:
+            def top(self):
+                return ["example.com", "example.net", "example.org"]
+
+        def mock_get(self, list_version):  # pylint:disable=unused-argument
+            return MockResponse()
+
+        monkeypatch.setattr(Tranco, "list", mock_get)
+
+        # also clear exclude_domains
+        monkeypatch.setattr(cr, "exclude_domains", set())
+
+        assert cr.get_domain_list() == expected
+
+    @pytest.mark.skip()
+    def test_recently_failed_domains(self):
+        pass
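
Note: the parametrized cases pin down the intended semantics of --exclude: suffix filtering (and the recently-failed set) is applied before the list is cut down to num_sites, which is why the (1, ".gov", ["example.com"]) case still yields a result. A hypothetical sketch of filtering logic consistent with these cases follows; get_domain_list exists in crawler.py per the assertion above, but this body is illustrative, not the actual implementation:

    # Hypothetical sketch (not the code from crawler.py) of a Crawler
    # method filtering domains consistently with the cases above.
    from tranco import Tranco

    def get_domain_list(self):
        suffixes = tuple(self.exclude_suffixes.split(",")) if self.exclude_suffixes else ()
        domains = []
        for domain in Tranco(cache=False).list(self.version).top():
            if domain in self.exclude_domains:
                continue  # skip recently failed domains
            if suffixes and domain.endswith(suffixes):
                continue  # skip domains matching an --exclude suffix
            domains.append(domain)
            if len(domains) == self.num_sites:
                break  # stop once we have enough sites
        return domains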
