From 3202a68dfcfe99357cf83ec8f8695d319ffbf832 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Thu, 14 Mar 2024 12:05:26 -0700 Subject: [PATCH] Remove embedding code from public repo This code is pretty specific to our workflow, and adds a bunch of overhead to AnalyzedUrl's. We're going to bring this small part of the code into a private repo, similar to our ML modeling before it. --- .gitignore | 3 + adserver/analyzer/backends/__init__.py | 1 - adserver/analyzer/backends/st.py | 52 ------------ .../migrations/0005_remove_embedding.py | 20 +++++ adserver/analyzer/models.py | 2 - adserver/analyzer/tasks.py | 17 +++- adserver/analyzer/views.py | 82 +------------------ adserver/api/urls.py | 4 +- config/settings/base.py | 12 +++ docker-compose.yml | 2 + 10 files changed, 55 insertions(+), 140 deletions(-) delete mode 100644 adserver/analyzer/backends/st.py create mode 100644 adserver/analyzer/migrations/0005_remove_embedding.py diff --git a/.gitignore b/.gitignore index 4341e3a0..fb63fe94 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,9 @@ celerybeat-schedule celerybeat-schedule.db celerybeat.pid +# VSCode +.vscode + ########################################################################## # Ad Server specific ignores diff --git a/adserver/analyzer/backends/__init__.py b/adserver/analyzer/backends/__init__.py index 1a20db1d..c48786af 100644 --- a/adserver/analyzer/backends/__init__.py +++ b/adserver/analyzer/backends/__init__.py @@ -1,5 +1,4 @@ """Backends for analyzing URLs for keywords and topics.""" from .eatopics import EthicalAdsTopicsBackend # noqa from .naive import NaiveKeywordAnalyzerBackend # noqa -from .st import SentenceTransformerAnalyzerBackend # noqa from .textacynlp import TextacyAnalyzerBackend # noqa diff --git a/adserver/analyzer/backends/st.py b/adserver/analyzer/backends/st.py deleted file mode 100644 index f5339214..00000000 --- a/adserver/analyzer/backends/st.py +++ /dev/null @@ -1,52 +0,0 @@ -import logging -import os - -import 
trafilatura -from bs4 import BeautifulSoup -from sentence_transformers import SentenceTransformer -from textacy import preprocessing - -from ...models import Topic -from .base import BaseAnalyzerBackend - -log = logging.getLogger(__name__) # noqa - - -class SentenceTransformerAnalyzerBackend(BaseAnalyzerBackend): - """ - Quick and dirty analyzer that uses the SentenceTransformer library - """ - - MODEL_NAME = os.getenv("SENTENCE_TRANSFORMERS_MODEL", "multi-qa-MiniLM-L6-cos-v1") - MODEL_HOME = os.getenv("SENTENCE_TRANSFORMERS_HOME", "/tmp/sentence_transformers") - - def preprocess_text(self, text): - log.info("Preprocessing text: %s", text) - self.preprocessor = preprocessing.make_pipeline( - preprocessing.normalize.unicode, - preprocessing.remove.punctuation, - preprocessing.normalize.whitespace, - ) - return self.preprocessor(text).lower()[: self.MAX_INPUT_LENGTH] - - def analyze_response(self, resp): - # Disable the analysis for now - return [] - - def get_content(self, *args): - downloaded = trafilatura.fetch_url(self.url) - result = trafilatura.extract( - downloaded, include_comments=False, include_tables=False - ) - return self.preprocess_text(result) - - def embed_response(self, resp) -> list: - """Analyze an HTTP response and return a list of keywords/topics for the URL.""" - model = SentenceTransformer(self.MODEL_NAME, cache_folder=self.MODEL_HOME) - text = self.get_content(resp) - if text: - log.info("Postprocessed text: %s", text) - embedding = model.encode(text) - return embedding.tolist() - - return None diff --git a/adserver/analyzer/migrations/0005_remove_embedding.py b/adserver/analyzer/migrations/0005_remove_embedding.py new file mode 100644 index 00000000..ad97f3fd --- /dev/null +++ b/adserver/analyzer/migrations/0005_remove_embedding.py @@ -0,0 +1,20 @@ +# Generated by Django 4.2.11 on 2024-03-14 18:53 +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("adserver_analyzer", "0004_add_embeddings"), 
+ ]
 + + operations = [ + migrations.RemoveField( + model_name="analyzedurl", + name="embedding", + ), + migrations.RemoveField( + model_name="historicalanalyzedurl", + name="embedding", + ), + ] diff --git a/adserver/analyzer/models.py b/adserver/analyzer/models.py index 1836fb5c..25fd6023 100644 --- a/adserver/analyzer/models.py +++ b/adserver/analyzer/models.py @@ -56,8 +56,6 @@ class AnalyzedUrl(TimeStampedModel): ), ) - embedding = VectorField(dimensions=384, default=None, null=True, blank=True) - history = HistoricalRecords() def __str__(self): diff --git a/adserver/analyzer/tasks.py b/adserver/analyzer/tasks.py index 62f48675..1756268e 100644 --- a/adserver/analyzer/tasks.py +++ b/adserver/analyzer/tasks.py @@ -17,6 +17,9 @@ from .utils import normalize_url from config.celery_app import app +if "ethicalads_ext.embedding" in settings.INSTALLED_APPS: + from ethicalads_ext.embedding.models import Embedding + log = logging.getLogger(__name__) # noqa @@ -91,18 +94,28 @@ def analyze_url(url, publisher_slug, force=False): publisher=publisher, defaults={ "keywords": keywords, - "embedding": embedding, "last_analyzed_date": timezone.now(), }, ) if not created: url_obj.keywords = keywords - url_obj.embedding = embedding url_obj.last_analyzed_date = timezone.now() url_obj.visits_since_last_analyzed = 0 url_obj.save() + if "ethicalads_ext.embedding" in settings.INSTALLED_APPS: + embedding_obj, embedding_created = Embedding.objects.get_or_create( + url=url_obj, + model="v1", + defaults={ + "embedding": embedding, + }, + ) + if not embedding_created: + embedding_obj.embedding = embedding + embedding_obj.save() + @app.task def daily_visited_urls_aggregation(day=None): diff --git a/adserver/analyzer/views.py b/adserver/analyzer/views.py index 582926f4..afed4c7d 100644 --- a/adserver/analyzer/views.py +++ b/adserver/analyzer/views.py @@ -1,81 +1 @@ -from urllib.parse import urlparse - -from django.conf import settings -from pgvector.django import CosineDistance -from rest_framework import status -from 
rest_framework.permissions import AllowAny -from rest_framework.renderers import StaticHTMLRenderer -from rest_framework.response import Response -from rest_framework.views import APIView - -from adserver.analyzer.backends.st import SentenceTransformerAnalyzerBackend -from adserver.analyzer.models import AnalyzedUrl - - -if "adserver.analyzer" in settings.INSTALLED_APPS: - - class EmbeddingViewSet(APIView): - """ - Returns a list of similar URLs and scores based on querying the AnalyzedURL embedding for an incoming URL. - - Example: http://localhost:5000/api/v1/similar/?url=https://www.gitbook.com/ - - .. http:get:: /api/v1/embedding/ - - Return a list of similar URLs and scores based on querying the AnalyzedURL embedding for an incoming URL - - :json int count: The number of similar URLs returned - :>json array results: An array of similar URLs and scores - """ - - permission_classes = [AllowAny] - - def get(self, request): - """Return a list of similar URLs and scores based on querying the AnalyzedURL embedding for an incoming URL.""" - url = request.query_params.get("url") - - if not url: - return Response( - {"error": "url is required"}, status=status.HTTP_400_BAD_REQUEST - ) - - backend_instance = SentenceTransformerAnalyzerBackend(url) - response = backend_instance.fetch() - if not response: - return Response( - {"error": "Not able to fetch content from URL"}, - status=status.HTTP_400_BAD_REQUEST, - ) - processed_text = backend_instance.get_content(response) - analyzed_embedding = backend_instance.embedding(response) - - unfiltered_urls = ( - AnalyzedUrl.objects.filter(publisher__allow_paid_campaigns=True) - .exclude(embedding=None) - .annotate(distance=CosineDistance("embedding", analyzed_embedding)) - .order_by("distance")[:25] - ) - - # Filter urls to ensure each domain is unique - unique_domains = set() - urls = [] - for url in unfiltered_urls: - domain = urlparse(url.url).netloc - if domain not in unique_domains: - unique_domains.add(domain) - 
urls.append(url) - - if not len(urls) > 3: - return Response( - {"error": "No similar URLs found"}, status=status.HTTP_404_NOT_FOUND - ) - - return Response( - { - "count": len(urls), - "text": processed_text[:500], - "results": [[url.url, url.distance] for url in urls], - } - ) +# Left blank diff --git a/adserver/api/urls.py b/adserver/api/urls.py index 27a63e30..a81c4e43 100644 --- a/adserver/api/urls.py +++ b/adserver/api/urls.py @@ -16,8 +16,8 @@ router.register(r"advertisers", AdvertiserViewSet, basename="advertisers") router.register(r"publishers", PublisherViewSet, basename="publishers") -if "adserver.analyzer" in settings.INSTALLED_APPS: - from adserver.analyzer.views import EmbeddingViewSet +if "ethicalads_ext.embedding" in settings.INSTALLED_APPS: + from ethicalads_ext.embedding.views import EmbeddingViewSet urlpatterns += [path(r"similar/", EmbeddingViewSet.as_view(), name="similar")] diff --git a/config/settings/base.py b/config/settings/base.py index 9c411764..fad1eb7b 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -23,6 +23,14 @@ except ImproperlyConfigured: log.info("Unable to read env file. Assuming environment is already set.") +# This is a bit of a hack to allow us to import the ethicalads_ext package +# which contains private extensions to the ad server. +try: + import ethicalads_ext # noqa + + ext = True +except ImportError: + ext = False # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 
BASE_DIR = os.path.abspath( @@ -69,6 +77,10 @@ "corsheaders", ] + +if ext: + INSTALLED_APPS.append("ethicalads_ext.embedding") + MIDDLEWARE = [ "django.middleware.security.SecurityMiddleware", "enforce_host.EnforceHostMiddleware", diff --git a/docker-compose.yml b/docker-compose.yml index 63e2db83..bbc288bf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -28,6 +28,8 @@ services: # Make it so we can edit the start script dynamically, # for example to install dependencies - ./docker-compose/django/start:/start + # Load the ethicalads_ext code from the host, so we don't have to rebuild + - ${PWD}/${EA_EXT_PATH:-../ethicalads-ext/ethicalads_ext}:/app/ethicalads_ext env_file: - ./.envs/local/django - ./.envs/local/postgres