From 4c369efd05d8c385e721f19a0d73efe6029d06a6 Mon Sep 17 00:00:00 2001
From: Alexei
Date: Mon, 4 Mar 2024 09:00:02 -0500
Subject: [PATCH] Cache PSL data for one week in system tempdir

---
 crawler.py        |  9 +++------
 lib/basedomain.py | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/crawler.py b/crawler.py
index 01ad5b11..0c5cfe2a 100755
--- a/crawler.py
+++ b/crawler.py
@@ -44,10 +44,11 @@
 from selenium.webdriver.firefox.service import Service as FirefoxService
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
-from tldextract import TLDExtract
 from tranco import Tranco
 from xvfbwrapper import Xvfb
 
+from lib.basedomain import extract
+
 CHROME_EXT_ID = 'mcgekeccgjgcmhnhbabplanchdogjcnh'
 CHROME_URL_PREFIX = 'chrome-extension://'
 
@@ -1073,8 +1074,7 @@ def cleanup(self, d1, d2):
             self.logger.info(str(snitch_map['']))
             del snitch_map['']
 
-        # TODO once the need for this is gone, should be able to get rid of tldextract, in this script anyway
-        d1_base = self.tld_extract(d1).registered_domain
+        d1_base = extract(d1).registered_domain
         if not d1_base:
             d1_base = d1
 
@@ -1153,9 +1153,6 @@ def save(self, data, name='results.json'):
 
     crawler.init_logging(args.log_stdout)
 
-    crawler.logger.info("Fetching TLD definitions ...")
-    crawler.tld_extract = TLDExtract(cache_dir=False, include_psl_private_domains=True)
-
     crawler.start_browser()
     crawler.log_scan_summary()
 
diff --git a/lib/basedomain.py b/lib/basedomain.py
index a416e1f6..4bbf476b 100644
--- a/lib/basedomain.py
+++ b/lib/basedomain.py
@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 
+import os
+import shutil
+import tempfile
 import tldextract
 
+from datetime import datetime, timedelta
 
 _extract = None
 
@@ -10,6 +14,14 @@ def extract(domain):
 
     # lazy init
     if not _extract:
-        _extract = tldextract.TLDExtract(cache_dir=False, include_psl_private_domains=True)
+        cache_dir = os.path.join(tempfile.gettempdir(), "python-tldextract")
+
+        # expire PSL cache after one week
+        elapsed_time = datetime.now() - datetime.fromtimestamp(os.stat(cache_dir).st_mtime)
+        if elapsed_time >= timedelta(weeks=1):
+            shutil.rmtree(cache_dir)
+
+        _extract = tldextract.TLDExtract(cache_dir=cache_dir,
+                                         include_psl_private_domains=True)
 
     return _extract(domain)
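
One caveat with the new expiry check: os.stat(cache_dir) raises FileNotFoundError when the cache directory does not exist yet, e.g. on a machine's very first crawl before tldextract has created it. Below is a minimal sketch of the same lazy-init helper with an existence guard added; the guard (os.path.isdir) is an assumption of this sketch and is not part of the patch above, while the "python-tldextract" tempdir location and the TLDExtract arguments are taken from the patch.

import os
import shutil
import tempfile
from datetime import datetime, timedelta

import tldextract

_extract = None


def extract(domain):
    global _extract

    # lazy init
    if not _extract:
        cache_dir = os.path.join(tempfile.gettempdir(), "python-tldextract")

        # expire the PSL cache after one week, but only stat it if it
        # already exists (it won't on the very first run)
        if os.path.isdir(cache_dir):
            age = datetime.now() - datetime.fromtimestamp(os.stat(cache_dir).st_mtime)
            if age >= timedelta(weeks=1):
                shutil.rmtree(cache_dir)

        _extract = tldextract.TLDExtract(cache_dir=cache_dir,
                                         include_psl_private_domains=True)

    return _extract(domain)

With this in place, extract('www.example.co.uk').registered_domain returns 'example.co.uk', and the downloaded PSL data is reused across crawler runs until the tempdir copy is more than a week old.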