Skip to content

Commit

Permalink
Refactor backends to only fetch once
Browse files Browse the repository at this point in the history
  • Loading branch information
ericholscher committed Feb 9, 2024
1 parent d920e15 commit 59292f4
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 23 deletions.
35 changes: 16 additions & 19 deletions adserver/analyzer/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,54 +62,52 @@ def fetch(self, **kwargs):
kwargs.setdefault("headers", {"user-agent": self.user_agent})

try:
self.resp = requests.get(self.url, **kwargs)
return self.resp
return requests.get(self.url, **kwargs)
except (requests.exceptions.RequestException, urllib3.exceptions.HTTPError):
log.info("Error analyzing URL: %s", self.url, exc_info=True)

return None

def analyze(self):
def analyze(self, response):
"""
Fetch the response and parse it for keywords.
Parse response for keywords.
:returns list: a list of keywords or `None` if the URL doesn't respond.
"""
self.fetch()

if self.resp and self.resp.ok:
return self.analyze_response(self.resp)
if response and response.ok:
return self.analyze_response(response)

if not self.resp:
if not response:
log.debug("Failed to connect. Url=%s", self.url)
else:
log.debug(
"Failed to connect. Url=%s, Status=%s", self.url, self.resp.status_code
"Failed to connect. Url=%s, Status=%s", self.url, response.status_code
)

# A failed request results in `None`.
return None

def embedding(self):
def embedding(self, response):
"""
Parse the response for embeddings.
:returns vector: A 384-dimensional vector or `None` if the URL doesn't respond.
"""

if self.resp and self.resp.ok:
return self.embed_response(self.resp)
if response and response.ok:
return self.embed_response(response)

if not self.resp:
if not response:
log.debug("Failed to connect. Url=%s", self.url)
else:
log.debug(
"Failed to connect. Url=%s, Status=%s", self.url, self.resp.status_code
"Failed to connect. Url=%s, Status=%s", self.url, response.status_code
)

return None

def analyze_response(self, resp):
def analyze_response(self, response):
"""
Analyze an HTTP response and return keywords/topics for the URL.
Expand All @@ -120,13 +118,12 @@ def analyze_response(self, resp):
"""
raise NotImplementedError("Subclasses should define this.")

def embed_response(self, resp):
def embed_response(self, response):
"""
Analyze an HTTP response and return an embedding for the URL.
This will only be passed a successful response (20x).
All responses should return a vector even if that list is empty.
This needs to be defined by subclasses.
"""
raise NotImplementedError("Subclasses should define this.")
log.warning("No embedding configured for %s", self.__class__.__name__)
return []
5 changes: 3 additions & 2 deletions adserver/analyzer/management/commands/runmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,12 @@ def handle_url(self, url):
embeddings = []
for backend in get_url_analyzer_backends():
backend_instance = backend(url)
analyzed_keywords = backend_instance.analyze()
response = backend_instance.fetch()
analyzed_keywords = backend_instance.analyze(response)
self.stdout.write(
_("Keywords from '%s': %s") % (backend.__name__, analyzed_keywords)
)
analyzed_embedding = backend.embedding()
# Call the method on the instance; `backend` is the class itself.
analyzed_embedding = backend_instance.embedding(response)
self.stdout.write(
_("Embeddings from '%s': %s") % (backend.__name__, analyzed_embedding)
)
Expand Down
9 changes: 7 additions & 2 deletions adserver/analyzer/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,18 @@ def analyze_url(url, publisher_slug, force=False):
log.debug("Analyzing url: %s", normalized_url)
keywords = set()
embeddings = []
response = None

for backend in get_url_analyzer_backends():
backend_instance = backend(url)
analyzed_keywords = backend_instance.analyze() # Can be None
# Cache responses across backends
if not response:
response = backend_instance.fetch()

analyzed_keywords = backend_instance.analyze(response) # Can be None
log.debug("Keywords from '%s': %s", backend.__name__, analyzed_keywords)

analyzed_embedding = backend_instance.embedding() # Can be None
analyzed_embedding = backend_instance.embedding(response) # Can be None
log.debug("Embedding from '%s': %s", backend.__name__, analyzed_embedding)

if analyzed_keywords:
Expand Down

0 comments on commit 59292f4

Please sign in to comment.