From 3202a68dfcfe99357cf83ec8f8695d319ffbf832 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Thu, 14 Mar 2024 12:05:26 -0700 Subject: [PATCH] Remove embedding code from public repo This code is pretty specific to our workflow, and adds a bunch of overhead to AnalyzedUrl's. We're going to bring this small part of the code into a private repo, similar to our ML modeling before it. --- .gitignore | 3 + adserver/analyzer/backends/__init__.py | 1 - adserver/analyzer/backends/st.py | 52 ------------ .../migrations/0005_remove_embedding.py | 20 +++++ adserver/analyzer/models.py | 2 - adserver/analyzer/tasks.py | 17 +++- adserver/analyzer/views.py | 82 +------------------ adserver/api/urls.py | 4 +- config/settings/base.py | 12 +++ docker-compose.yml | 2 + 10 files changed, 55 insertions(+), 140 deletions(-) delete mode 100644 adserver/analyzer/backends/st.py create mode 100644 adserver/analyzer/migrations/0005_remove_embedding.py diff --git a/.gitignore b/.gitignore index 4341e3a0..fb63fe94 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,9 @@ celerybeat-schedule celerybeat-schedule.db celerybeat.pid +# VSCode +.vscode + ########################################################################## # Ad Server specific ignores diff --git a/adserver/analyzer/backends/__init__.py b/adserver/analyzer/backends/__init__.py index 1a20db1d..c48786af 100644 --- a/adserver/analyzer/backends/__init__.py +++ b/adserver/analyzer/backends/__init__.py @@ -1,5 +1,4 @@ """Backends for analyzing URLs for keywords and topics.""" from .eatopics import EthicalAdsTopicsBackend # noqa from .naive import NaiveKeywordAnalyzerBackend # noqa -from .st import SentenceTransformerAnalyzerBackend # noqa from .textacynlp import TextacyAnalyzerBackend # noqa diff --git a/adserver/analyzer/backends/st.py b/adserver/analyzer/backends/st.py deleted file mode 100644 index f5339214..00000000 --- a/adserver/analyzer/backends/st.py +++ /dev/null @@ -1,52 +0,0 @@ -import logging -import os - -import 
trafilatura -from bs4 import BeautifulSoup -from sentence_transformers import SentenceTransformer -from textacy import preprocessing - -from ...models import Topic -from .base import BaseAnalyzerBackend - -log = logging.getLogger(__name__) # noqa - - -class SentenceTransformerAnalyzerBackend(BaseAnalyzerBackend): - """ - Quick and dirty analyzer that uses the SentenceTransformer library - """ - - MODEL_NAME = os.getenv("SENTENCE_TRANSFORMERS_MODEL", "multi-qa-MiniLM-L6-cos-v1") - MODEL_HOME = os.getenv("SENTENCE_TRANSFORMERS_HOME", "/tmp/sentence_transformers") - - def preprocess_text(self, text): - log.info("Preprocessing text: %s", text) - self.preprocessor = preprocessing.make_pipeline( - preprocessing.normalize.unicode, - preprocessing.remove.punctuation, - preprocessing.normalize.whitespace, - ) - return self.preprocessor(text).lower()[: self.MAX_INPUT_LENGTH] - - def analyze_response(self, resp): - # Disable the analysis for now - return [] - - def get_content(self, *args): - downloaded = trafilatura.fetch_url(self.url) - result = trafilatura.extract( - downloaded, include_comments=False, include_tables=False - ) - return self.preprocess_text(result) - - def embed_response(self, resp) -> list: - """Analyze an HTTP response and return a list of keywords/topics for the URL.""" - model = SentenceTransformer(self.MODEL_NAME, cache_folder=self.MODEL_HOME) - text = self.get_content(resp) - if text: - log.info("Postprocessed text: %s", text) - embedding = model.encode(text) - return embedding.tolist() - - return None diff --git a/adserver/analyzer/migrations/0005_remove_embedding.py b/adserver/analyzer/migrations/0005_remove_embedding.py new file mode 100644 index 00000000..ad97f3fd --- /dev/null +++ b/adserver/analyzer/migrations/0005_remove_embedding.py @@ -0,0 +1,20 @@ +# Generated by Django 4.2.11 on 2024-03-14 18:53 +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("adserver_analyzer", "0004_add_embeddings"), 
+ ]
 + + operations = [ + migrations.RemoveField( + model_name="analyzedurl", + name="embedding", + ), + migrations.RemoveField( + model_name="historicalanalyzedurl", + name="embedding", + ), + ] diff --git a/adserver/analyzer/models.py b/adserver/analyzer/models.py index 1836fb5c..25fd6023 100644 --- a/adserver/analyzer/models.py +++ b/adserver/analyzer/models.py @@ -56,8 +56,6 @@ class AnalyzedUrl(TimeStampedModel): ), ) - embedding = VectorField(dimensions=384, default=None, null=True, blank=True) - history = HistoricalRecords() def __str__(self): diff --git a/adserver/analyzer/tasks.py b/adserver/analyzer/tasks.py index 62f48675..1756268e 100644 --- a/adserver/analyzer/tasks.py +++ b/adserver/analyzer/tasks.py @@ -17,6 +17,9 @@ from .utils import normalize_url from config.celery_app import app +if "ethicalads_ext.embedding" in settings.INSTALLED_APPS: + from ethicalads_ext.embedding.models import Embedding + log = logging.getLogger(__name__) # noqa @@ -91,18 +94,28 @@ def analyze_url(url, publisher_slug, force=False): publisher=publisher, defaults={ "keywords": keywords, - "embedding": embedding, "last_analyzed_date": timezone.now(), }, ) if not created: url_obj.keywords = keywords - url_obj.embedding = embedding url_obj.last_analyzed_date = timezone.now() url_obj.visits_since_last_analyzed = 0 url_obj.save() + if "ethicalads_ext.embedding" in settings.INSTALLED_APPS: + embedding_obj, embedding_created = Embedding.objects.get_or_create( + url=url_obj, + model="v1", + defaults={ + "embedding": embedding, + }, + ) + if not embedding_created: + embedding_obj.embedding = embedding + embedding_obj.save() + @app.task def daily_visited_urls_aggregation(day=None): diff --git a/adserver/analyzer/views.py b/adserver/analyzer/views.py index 582926f4..afed4c7d 100644 --- a/adserver/analyzer/views.py +++ b/adserver/analyzer/views.py @@ -1,81 +1 @@ -from urllib.parse import urlparse - -from django.conf import settings -from pgvector.django import CosineDistance -from rest_framework import status -from 
rest_framework.permissions import AllowAny -from rest_framework.renderers import StaticHTMLRenderer -from rest_framework.response import Response -from rest_framework.views import APIView - -from adserver.analyzer.backends.st import SentenceTransformerAnalyzerBackend -from adserver.analyzer.models import AnalyzedUrl - - -if "adserver.analyzer" in settings.INSTALLED_APPS: - - class EmbeddingViewSet(APIView): - """ - Returns a list of similar URLs and scores based on querying the AnalyzedURL embedding for an incoming URL. - - Example: http://localhost:5000/api/v1/similar/?url=https://www.gitbook.com/ - - .. http:get:: /api/v1/embedding/ - - Return a list of similar URLs and scores based on querying the AnalyzedURL embedding for an incoming URL - - :json int count: The number of similar URLs returned - :>json array results: An array of similar URLs and scores - """ - - permission_classes = [AllowAny] - - def get(self, request): - """Return a list of similar URLs and scores based on querying the AnalyzedURL embedding for an incoming URL.""" - url = request.query_params.get("url") - - if not url: - return Response( - {"error": "url is required"}, status=status.HTTP_400_BAD_REQUEST - ) - - backend_instance = SentenceTransformerAnalyzerBackend(url) - response = backend_instance.fetch() - if not response: - return Response( - {"error": "Not able to fetch content from URL"}, - status=status.HTTP_400_BAD_REQUEST, - ) - processed_text = backend_instance.get_content(response) - analyzed_embedding = backend_instance.embedding(response) - - unfiltered_urls = ( - AnalyzedUrl.objects.filter(publisher__allow_paid_campaigns=True) - .exclude(embedding=None) - .annotate(distance=CosineDistance("embedding", analyzed_embedding)) - .order_by("distance")[:25] - ) - - # Filter urls to ensure each domain is unique - unique_domains = set() - urls = [] - for url in unfiltered_urls: - domain = urlparse(url.url).netloc - if domain not in unique_domains: - unique_domains.add(domain) - 
urls.append(url) - - if not len(urls) > 3: - return Response( - {"error": "No similar URLs found"}, status=status.HTTP_404_NOT_FOUND - ) - - return Response( - { - "count": len(urls), - "text": processed_text[:500], - "results": [[url.url, url.distance] for url in urls], - } - ) +# Left blank diff --git a/adserver/api/urls.py b/adserver/api/urls.py index 27a63e30..a81c4e43 100644 --- a/adserver/api/urls.py +++ b/adserver/api/urls.py @@ -16,8 +16,8 @@ router.register(r"advertisers", AdvertiserViewSet, basename="advertisers") router.register(r"publishers", PublisherViewSet, basename="publishers") -if "adserver.analyzer" in settings.INSTALLED_APPS: - from adserver.analyzer.views import EmbeddingViewSet +if "ethicalads_ext.embedding" in settings.INSTALLED_APPS: + from ethicalads_ext.embedding.views import EmbeddingViewSet urlpatterns += [path(r"similar/", EmbeddingViewSet.as_view(), name="similar")] diff --git a/config/settings/base.py b/config/settings/base.py index 9c411764..fad1eb7b 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -23,6 +23,14 @@ except ImproperlyConfigured: log.info("Unable to read env file. Assuming environment is already set.") +# This is a bit of a hack to allow us to import the ethicalads_ext package +# which contains private extensions to the ad server. +try: + import ethicalads_ext # noqa + + ext = True +except ImportError: + ext = False # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 
BASE_DIR = os.path.abspath( @@ -69,6 +77,10 @@ "corsheaders", ] + +if ext: + INSTALLED_APPS.append("ethicalads_ext.embedding") + MIDDLEWARE = [ "django.middleware.security.SecurityMiddleware", "enforce_host.EnforceHostMiddleware", diff --git a/docker-compose.yml b/docker-compose.yml index 63e2db83..bbc288bf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -28,6 +28,8 @@ services: # Make it so we can edit the start script dynamically, # for example to install dependencies - ./docker-compose/django/start:/start + # Load the ethicalads_ext code from the host, so we don't have to rebuild + - ${PWD}/${EA_EXT_PATH:-../ethicalads-ext/ethicalads_ext}:/app/ethicalads_ext env_file: - ./.envs/local/django - ./.envs/local/postgres