Skip to content

Commit

Permalink
Preprocess input and store model locally
Browse files Browse the repository at this point in the history
  • Loading branch information
ericholscher committed Feb 2, 2024
1 parent a791927 commit 9dc8728
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 6 deletions.
32 changes: 26 additions & 6 deletions adserver/analyzer/backends/st.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import logging
import os

from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

from ...models import Topic
from .base import BaseAnalyzerBackend
from .textacynlp import TextacyAnalyzerBackend

log = logging.getLogger(__name__) # noqa


class SentenceTransformerAnalyzerBackend(BaseAnalyzerBackend):
class SentenceTransformerAnalyzerBackend(TextacyAnalyzerBackend):
"""
Quick and dirty analyzer that uses the SentenceTransformer library
Expand Down Expand Up @@ -38,9 +40,27 @@ def analyze_response(self, resp):

def embed_response(self, resp) -> list:
    """
    Embed the main text content of an HTTP response.

    The response body is parsed as HTML, every node matching one of
    ``self.REMOVE_CONTENT_SELECTORS`` is removed, and the first element
    matched by the first ``self.MAIN_CONTENT_SELECTORS`` entry that hits is
    passed through ``self.preprocess_text`` and encoded with the
    ``multi-qa-MiniLM-L6-cos-v1`` SentenceTransformer model.

    The selector lists and ``preprocess_text`` are not defined in this
    method; they are expected to come from the class or one of its bases.

    :param resp: response object exposing the raw body as ``resp.content``
    :returns: the embedding converted to a plain list, or ``None`` when no
        main-content selector matches anything in the document (callers
        must handle ``None`` despite the ``list`` annotation)
    """
    soup = BeautifulSoup(resp.content, features="html.parser")

    # Drop unwanted nodes before extracting text so they never reach the model.
    for selector in self.REMOVE_CONTENT_SELECTORS:
        for node in soup.select(selector):
            node.decompose()

    for selector in self.MAIN_CONTENT_SELECTORS:
        results = soup.select(selector, limit=1)

        # If no results, go to the next selector
        if not results:
            continue

        # If results are found, use these and stop looking at the selectors
        text = self.preprocess_text(results[0].get_text())
        log.info("Embedding text: %s", text[:100])

        # Build the model only once there is text to embed, so a page with
        # no main content never pays for loading it. The cache folder comes
        # from SENTENCE_TRANSFORMERS_HOME and falls back to /tmp.
        model = SentenceTransformer(
            "multi-qa-MiniLM-L6-cos-v1",
            cache_folder=os.getenv("SENTENCE_TRANSFORMERS_HOME", "/tmp"),
        )
        embedding = model.encode(text)

        return embedding.tolist()

    return None
2 changes: 2 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ services:
- postgres
volumes:
- .:/app
- ../ethicalads-model:/model
- /tmp:/tmp
env_file:
- ./.envs/local/django
- ./.envs/local/postgres
Expand Down

0 comments on commit 9dc8728

Please sign in to comment.