Skip to content

Commit

Permalink
Cache PSL data for one week in system tempdir
Browse files Browse the repository at this point in the history
  • Loading branch information
ghostwords committed Mar 4, 2024
1 parent 91f34bd commit 4c369ef
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 7 deletions.
9 changes: 3 additions & 6 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tldextract import TLDExtract
from tranco import Tranco
from xvfbwrapper import Xvfb

from lib.basedomain import extract


CHROME_EXT_ID = 'mcgekeccgjgcmhnhbabplanchdogjcnh'
CHROME_URL_PREFIX = 'chrome-extension://'
Expand Down Expand Up @@ -1073,8 +1074,7 @@ def cleanup(self, d1, d2):
self.logger.info(str(snitch_map['']))
del snitch_map['']

# TODO once the need for this is gone, should be able to get rid of tldextract, in this script anyway
d1_base = self.tld_extract(d1).registered_domain
d1_base = extract(d1).registered_domain
if not d1_base:
d1_base = d1

Expand Down Expand Up @@ -1153,9 +1153,6 @@ def save(self, data, name='results.json'):

crawler.init_logging(args.log_stdout)

crawler.logger.info("Fetching TLD definitions ...")
crawler.tld_extract = TLDExtract(cache_dir=False, include_psl_private_domains=True)

crawler.start_browser()

crawler.log_scan_summary()
Expand Down
14 changes: 13 additions & 1 deletion lib/basedomain.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#!/usr/bin/env python3

import os
import shutil
import tempfile
import tldextract

from datetime import datetime, timedelta

_extract = None

def extract(domain):
    """Return tldextract's extraction result for *domain*.

    Lazily initializes a module-level TLDExtract instance that caches
    Public Suffix List data in the system temp directory, refreshing
    the cache when it is a week old or older.
    """
    global _extract

    # lazy init
    if not _extract:
        cache_dir = os.path.join(tempfile.gettempdir(), "python-tldextract")

        # expire PSL cache after one week; on a first run the cache
        # directory does not exist yet (tldextract creates it below),
        # so skip the staleness check instead of crashing on os.stat()
        if os.path.exists(cache_dir):
            elapsed_time = datetime.now() - datetime.fromtimestamp(
                os.stat(cache_dir).st_mtime)
            if elapsed_time >= timedelta(weeks=1):
                shutil.rmtree(cache_dir)

        _extract = tldextract.TLDExtract(cache_dir=cache_dir,
                                         include_psl_private_domains=True)

    return _extract(domain)

0 comments on commit 4c369ef

Please sign in to comment.