Skip to content

Commit

Permalink
Use latest Ghostery blocklist
Browse files: browse the repository at this point in the history
  • Loading branch information
ghostwords committed Dec 27, 2024
1 parent 71c90a1 commit 3c29af5
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 43 deletions.
23 changes: 16 additions & 7 deletions lib/lists/blocklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,23 @@ def _download(self, url, filename):
with open(filename, 'w', encoding='utf-8') as file:
file.write(data.decode('utf-8'))

def exists_and_unexpired(self, filename, expire_cache_hrs):
    """Report whether a cached file is present and still fresh.

    Args:
        filename: Path of the cached file to check.
        expire_cache_hrs: Maximum allowed age in hours, measured from the
            file's modification time.

    Returns:
        True when ``filename`` is a regular file whose age does not exceed
        ``expire_cache_hrs``; False when it is missing, is not a regular
        file, or is older than that.
    """
    if not os.path.isfile(filename):
        return False

    try:
        modified_at = os.path.getmtime(filename)
    except OSError:
        # the file vanished (or became unreadable) between the isfile check
        # and the stat call; treat it like a missing cache entry
        return False

    age_hrs = (time.time() - modified_at) / 3600
    if age_hrs > expire_cache_hrs:
        return False

    return True

def fetch(self, url, filename, expire_cache_hrs=24):
    """Download ``url`` to ``filename`` unless a fresh cached copy exists.

    Args:
        url: Location to download from.
        filename: Path of the cached copy.
        expire_cache_hrs: Age in hours after which the cached copy is
            considered stale and downloaded again.

    The cache directory (``self.cache_dir``) is created if needed. A stale
    copy is moved to ``filename + ".bak"`` before the new download starts.
    """
    os.makedirs(self.cache_dir, exist_ok=True)

    if self.exists_and_unexpired(filename, expire_cache_hrs):
        # cached copy is still fresh; nothing to do
        return

    if os.path.isfile(filename):
        # first remove (back up) the file so that if downloading fails,
        # we know something went wrong
        os.replace(filename, filename + ".bak")

    self._download(url, filename)
55 changes: 19 additions & 36 deletions lib/lists/ghostery.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,12 @@

import json
import os

from collections.abc import MutableMapping
import urllib

from lib.basedomain import extract
from lib.lists.blocklist import Blocklist


# https://stackoverflow.com/a/6027615
def flatten(dictionary, parent_key='', separator='_'):
    """Collapse a nested mapping into a single-level dict.

    Keys along each path are joined with ``separator``; values that are
    not ``MutableMapping`` instances are kept as leaves. Nested mappings
    that are empty contribute no entries.

    Args:
        dictionary: Mapping to flatten; keys are concatenated with ``+``
            and so are expected to be strings.
        parent_key: Prefix applied to every key at this level (used by
            the recursion; leave empty for the top-level call).
        separator: String placed between joined key segments.

    Returns:
        A new dict mapping each joined key path to its leaf value.
    """
    flat = {}
    for key, value in dictionary.items():
        new_key = parent_key + separator + key if parent_key else key
        if isinstance(value, MutableMapping):
            # merge the nested level directly instead of building a
            # throwaway list of (key, value) tuples first
            flat.update(flatten(value, new_key, separator))
        else:
            flat[new_key] = value
    return flat


class Ghostery(Blocklist):

bases = set()
Expand All @@ -30,10 +17,18 @@ class Ghostery(Blocklist):
blocked_categories = ("advertising", "site_analytics", "pornvertising")

def __init__(self):
url = "https://cdn.ghostery.com/update/v4.1/bugs.json"
filename = os.path.join(self.cache_dir, "ghostery-bugs.json")
filename = os.path.join(self.cache_dir, "ghostery-trackerdb.json")
expire_hrs = 168 # weekly expiration

if not self.exists_and_unexpired(filename, expire_hrs):
url = "https://github.com/ghostery/trackerdb/releases/latest"
with urllib.request.urlopen(
urllib.request.Request(url, method='HEAD')) as conn:
version = conn.geturl().rpartition('/')[-1]

self.fetch(url, filename, expire_cache_hrs=168) # weekly expiration
url = ("https://github.com/ghostery/trackerdb/releases"
f"/download/{version}/trackerdb.json")
self.fetch(url, filename, expire_cache_hrs=expire_hrs)

try:
with open(filename, encoding='utf-8') as file:
Expand All @@ -42,26 +37,14 @@ def __init__(self):
print(f"WARNING Failed to open {filename}")
return

# TODO review if we can ingest some domains from other pattern types, not just "host"

# since '_' is a valid domain names character, '_' is a bad separator
# for working with domains names; let's use ':' instead
host_patterns_flat = flatten(data["patterns"]["host"], separator=':')

for domain_key, bug_id in host_patterns_flat.items():
# trim the last segment ("_$") and then reverse
domain = ".".join(domain_key.split(":")[:-1][::-1])

base = extract(domain).registered_domain
if not base:
base = domain
self.bases.add(base)
for name in data['patterns']:
for domain in data['patterns'][name]['domains']:
base = extract(domain).registered_domain or domain
self.bases.add(base)

aid = data["bugs"][str(bug_id)]["aid"]
category = data["apps"][str(aid)]["cat"]
if category not in self.blocked_categories:
self.bases_unblocked.add(base)
if data['patterns'][name]['category'] not in self.blocked_categories:
self.bases_unblocked.add(base)

self.domains.add(domain)
self.domains.add(domain)

self.ready = True

0 comments on commit 3c29af5

Please sign in to comment.