Skip to content

Commit

Permalink
Retry getting driver.current_url w/ fallback
Browse files Browse the repository at this point in the history
Following up on 5211f67
  • Loading branch information
ghostwords committed Jan 6, 2025
1 parent 218b3dd commit ace90cd
Showing 1 changed file with 18 additions and 9 deletions.
27 changes: 18 additions & 9 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,6 +995,16 @@ def log_snitch_map_changes(self, old_snitches, new_snitches):
if diff:
self.logger.info("New domains in snitch_map: %s", ', '.join(sorted(diff)))

def get_current_url(self):
    """Return the browser's current URL, retrying when the lookup times out.

    Reads ``self.driver.current_url`` up to three times. When an attempt
    raises ``TimeoutException``, waits briefly (2s, then 3s) before trying
    again.

    Returns:
        The value of ``self.driver.current_url``, or ``None`` if every
        attempt timed out. Callers should supply their own fallback
        (for example ``self.get_current_url() or domain``).
    """
    max_attempts = 3

    for attempt in range(max_attempts):
        try:
            return self.driver.current_url
        except TimeoutException:
            # Back off a little longer each time, but only *between*
            # attempts: sleeping after the final failure would just delay
            # returning None without any retry to follow.
            if attempt < max_attempts - 1:
                time.sleep(2 + attempt)

    return None


def crawl(self):
"""
Visit the top `num_sites` websites in the Tranco list, in order, in
Expand Down Expand Up @@ -1038,24 +1048,23 @@ def crawl(self):
# load the next domain
self.logger.info("Visiting %d: %s", i + 1, domain)
self.visit_domain(domain)
self.logger.info("Visited %s", self.driver.current_url)
self.logger.info("Visited %s", self.get_current_url() or domain)
num_visited += 1

except (MaxRetryError, ProtocolError, ReadTimeoutError) as ex:
self.logger.warning("Error loading %s:\n%s",
self.driver.current_url, str(ex))
self.get_current_url() or domain,
str(ex))
self.restart_browser()

except TimeoutException as ex:
# TODO driver.current_url here can raise a "Timed out
# TODO receiving message from renderer" TimeoutException
# TODO when we run into same exception in visit_domain()
self.logger.warning("Timed out loading %s (%s)",
domain, str(ex))
except TimeoutException:
self.logger.warning("Timed out loading %s",
self.get_current_url() or domain)

except WebDriverException as ex:
self.logger.error("%s on %s: %s", type(ex).__name__,
self.driver.current_url, ex.msg)
self.get_current_url() or domain,
ex.msg)
if should_restart(ex):
self.restart_browser()

Expand Down

0 comments on commit ace90cd

Please sign in to comment.