diff --git a/adserver/analyzer/backends/textacynlp.py b/adserver/analyzer/backends/textacynlp.py index 78f46e5b..bf1a0f0d 100644 --- a/adserver/analyzer/backends/textacynlp.py +++ b/adserver/analyzer/backends/textacynlp.py @@ -19,7 +19,7 @@ class TextacyAnalyzerBackend(NaiveKeywordAnalyzerBackend): https://textacy.readthedocs.io/en/latest/quickstart.html """ - TOP_PHRASE_COUNT = 20 + TOP_PHRASE_COUNT = 50 # Minimum phrase length where each word isn't required to be in the output phrase MIN_PHRASE_LENGTH = 6 diff --git a/adserver/analyzer/management/commands/runmodel.py b/adserver/analyzer/management/commands/runmodel.py index c2de5e83..c4b90069 100644 --- a/adserver/analyzer/management/commands/runmodel.py +++ b/adserver/analyzer/management/commands/runmodel.py @@ -4,7 +4,7 @@ from django.core.validators import URLValidator from django.utils.translation import gettext_lazy as _ -from ...utils import get_url_analyzer_backend +from ...utils import get_url_analyzer_backends class Command(BaseCommand): @@ -24,7 +24,7 @@ def add_arguments(self, parser): def handle(self, *args, **kwargs): """Entrypoint to the command.""" self.stdout.write( - _("Using the model from %s") % settings.ADSERVER_ANALYZER_BACKEND + _("Using the model(s) from %s") % settings.ADSERVER_ANALYZER_BACKEND ) for url in kwargs["urls"]: @@ -36,10 +36,16 @@ def handle_url(self, url): """Dump questions from metabase to a file.""" self.stdout.write(_("Running against %s") % url) - backend = get_url_analyzer_backend()(url) - keywords = backend.analyze() - - if keywords is None: - self.stderr.write(_("Failed to connect/process %s") % url) + keywords = [] + for backend in get_url_analyzer_backends(): + backend_instance = backend(url) + analyzed_keywords = backend_instance.analyze() + self.stdout.write( + _("Keywords from '%s': %s") % (backend.__name__, analyzed_keywords) + ) + + if analyzed_keywords: + for kw in analyzed_keywords: + keywords.append(kw) self.stdout.write(_("Keywords/topics: %s") % keywords) diff --git a/adserver/analyzer/tasks.py b/adserver/analyzer/tasks.py index 1ce9649d..0970df3d 100644 --- a/adserver/analyzer/tasks.py +++ b/adserver/analyzer/tasks.py @@ -13,7 +13,7 @@ from ..models import Publisher from ..utils import get_day from .models import AnalyzedUrl -from .utils import get_url_analyzer_backend +from .utils import get_url_analyzer_backends from .utils import normalize_url from config.celery_app import app @@ -48,10 +48,19 @@ def analyze_url(url, publisher_slug): return log.debug("Analyzing url: %s", normalized_url) + keywords = set() - backend = get_url_analyzer_backend()(url) - keywords = backend.analyze() # Can be None + for backend in get_url_analyzer_backends(): + backend_instance = backend(url) + analyzed_keywords = backend_instance.analyze() # Can be None + log.debug("Keywords from '%s': %s", backend.__name__, analyzed_keywords) + if analyzed_keywords: + for kw in analyzed_keywords: + keywords.add(kw) + log.debug("Keywords found : %s", keywords) + + keywords = list(keywords) url_obj, created = AnalyzedUrl.objects.get_or_create( url=normalized_url, publisher=publisher, diff --git a/adserver/analyzer/utils.py b/adserver/analyzer/utils.py index 9efe03e5..a4b48b8d 100644 --- a/adserver/analyzer/utils.py +++ b/adserver/analyzer/utils.py @@ -7,8 +7,18 @@ from .constants import IGNORED_QUERY_PARAMS +def get_url_analyzer_backends(): + for backend in settings.ADSERVER_ANALYZER_BACKEND: + if backend: + yield import_string(backend) + + def get_url_analyzer_backend(): - return import_string(settings.ADSERVER_ANALYZER_BACKEND) + backends = list(get_url_analyzer_backends()) + if backends: + return backends[0] + + return None def normalize_url(url): diff --git a/config/settings/base.py b/config/settings/base.py index 3aa4be02..df95cd5c 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -487,12 +487,12 @@ default="adserver.decisionengine.backends.ProbabilisticFlightBackend", ) -# The backend to be used by the ad server +# The backend(s) to be used by the ad server # for topic and keyword analysis -# Set to `None` to disable the analyzer entirely -ADSERVER_ANALYZER_BACKEND = env( +# Set to `None` or an empty string to disable the analyzer entirely +ADSERVER_ANALYZER_BACKEND = env.list( "ADSERVER_ANALYZER_BACKEND", - default="adserver.analyzer.backends.TextacyAnalyzerBackend", + default=["adserver.analyzer.backends.TextacyAnalyzerBackend"], ) if ADSERVER_ANALYZER_BACKEND: INSTALLED_APPS.append("adserver.analyzer")