Skip to content

Commit

Permalink
Allow running multiple analyzer models
Browse files Browse the repository at this point in the history
The setting `settings.ADSERVER_ANALYZER_BACKEND` can now be a list
and multiple analyzers will be run and combined.
  • Loading branch information
davidfischer committed Oct 27, 2023
1 parent b9a8272 commit d5a2703
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 16 deletions.
2 changes: 1 addition & 1 deletion adserver/analyzer/backends/textacynlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class TextacyAnalyzerBackend(NaiveKeywordAnalyzerBackend):
https://textacy.readthedocs.io/en/latest/quickstart.html
"""

TOP_PHRASE_COUNT = 20
TOP_PHRASE_COUNT = 50

# Minimum phrase length where each word isn't required to be in the output phrase
MIN_PHRASE_LENGTH = 6
Expand Down
20 changes: 13 additions & 7 deletions adserver/analyzer/management/commands/runmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from django.core.validators import URLValidator
from django.utils.translation import gettext_lazy as _

from ...utils import get_url_analyzer_backend
from ...utils import get_url_analyzer_backends


class Command(BaseCommand):
Expand All @@ -24,7 +24,7 @@ def add_arguments(self, parser):
def handle(self, *args, **kwargs):
"""Entrypoint to the command."""
self.stdout.write(
_("Using the model from %s") % settings.ADSERVER_ANALYZER_BACKEND
_("Using the model(s) from %s") % settings.ADSERVER_ANALYZER_BACKEND
)

for url in kwargs["urls"]:
Expand All @@ -36,10 +36,16 @@ def handle_url(self, url):
"""Run the configured analyzer backend(s) against a URL and print the keywords found."""
self.stdout.write(_("Running against %s") % url)

backend = get_url_analyzer_backend()(url)
keywords = backend.analyze()

if keywords is None:
self.stderr.write(_("Failed to connect/process %s") % url)
keywords = []
for backend in get_url_analyzer_backends():
backend_instance = backend(url)
analyzed_keywords = backend_instance.analyze()
self.stdout.write(
_("Keywords from '%s': %s") % (backend.__name__, analyzed_keywords)
)

if analyzed_keywords:
for kw in analyzed_keywords:
keywords.append(kw)

self.stdout.write(_("Keywords/topics: %s") % keywords)
15 changes: 12 additions & 3 deletions adserver/analyzer/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ..models import Publisher
from ..utils import get_day
from .models import AnalyzedUrl
from .utils import get_url_analyzer_backend
from .utils import get_url_analyzer_backends
from .utils import normalize_url
from config.celery_app import app

Expand Down Expand Up @@ -48,10 +48,19 @@ def analyze_url(url, publisher_slug):
return

log.debug("Analyzing url: %s", normalized_url)
keywords = set()

backend = get_url_analyzer_backend()(url)
keywords = backend.analyze() # Can be None
for backend in get_url_analyzer_backends():
backend_instance = backend(url)
analyzed_keywords = backend_instance.analyze() # Can be None
log.debug("Keywords from '%s': %s", backend.__name__, analyzed_keywords)
if analyzed_keywords:
for kw in analyzed_keywords:
keywords.add(kw)

log.debug("Keywords found : %s", keywords)

keywords = list(keywords)
url_obj, created = AnalyzedUrl.objects.get_or_create(
url=normalized_url,
publisher=publisher,
Expand Down
12 changes: 11 additions & 1 deletion adserver/analyzer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,18 @@
from .constants import IGNORED_QUERY_PARAMS


def get_url_analyzer_backends():
    """
    Yield each analyzer backend class named in the settings.

    Reads ``settings.ADSERVER_ANALYZER_BACKEND`` and imports every non-empty
    dotted path in it with ``import_string``, in the order configured.

    The setting is normally a list of dotted paths, but two other shapes are
    tolerated so existing configurations keep working:

    * a falsy value (``None``, ``""``, ``[]``) yields nothing, which means the
      analyzer is disabled;
    * a single dotted-path string is treated as a one-item list rather than
      being iterated character by character.

    Raises whatever ``import_string`` raises for a path that cannot be
    imported. Because this is a generator, that only happens once the
    offending entry is reached during iteration.
    """
    configured = settings.ADSERVER_ANALYZER_BACKEND

    # Nothing configured: the analyzer is disabled, so there is nothing to yield.
    if not configured:
        return

    # A bare string is a single backend path, not an iterable of paths.
    if isinstance(configured, str):
        configured = [configured]

    for backend in configured:
        # Skip blank entries (e.g. produced by a trailing comma in the env var).
        if backend:
            yield import_string(backend)


def get_url_analyzer_backend():
    """
    Return the first configured analyzer backend class, or ``None``.

    Kept for callers that only want a single backend. It takes the first item
    produced by ``get_url_analyzer_backends()`` and returns ``None`` when no
    backend is configured.

    Only the first backend is imported; later entries in the setting are not
    touched by this call.
    """
    # ``next`` with a default avoids building a throwaway list of all backends.
    return next(get_url_analyzer_backends(), None)


def normalize_url(url):
Expand Down
8 changes: 4 additions & 4 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,12 +487,12 @@
default="adserver.decisionengine.backends.ProbabilisticFlightBackend",
)

# The backend to be used by the ad server
# The backend(s) to be used by the ad server
# for topic and keyword analysis
# Set to `None` to disable the analyzer entirely
ADSERVER_ANALYZER_BACKEND = env(
# Set to `None` or an empty string to disable the analyzer entirely
ADSERVER_ANALYZER_BACKEND = env.list(
"ADSERVER_ANALYZER_BACKEND",
default="adserver.analyzer.backends.TextacyAnalyzerBackend",
default=["adserver.analyzer.backends.TextacyAnalyzerBackend"],
)
if ADSERVER_ANALYZER_BACKEND:
INSTALLED_APPS.append("adserver.analyzer")
Expand Down

0 comments on commit d5a2703

Please sign in to comment.