From a014b00c0271513bdeb2e94563ad252b323a9a12 Mon Sep 17 00:00:00 2001
From: Mews <60406199+Mews@users.noreply.github.com>
Date: Sat, 29 Jun 2024 19:55:54 +0100
Subject: [PATCH 1/7] Remove transient error code in `test_fetch_url_http_error`

---
 tests/networking/test_fetcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/networking/test_fetcher.py b/tests/networking/test_fetcher.py
index 61e52ed..a1a1fd2 100644
--- a/tests/networking/test_fetcher.py
+++ b/tests/networking/test_fetcher.py
@@ -34,7 +34,7 @@ def test_fetch_url_connection_error(caplog) -> None: # type: ignore
 
 @responses.activate
 def test_fetch_url_http_error(caplog) -> None: # type: ignore
-    error_codes = [403, 404, 408]
+    error_codes = [403, 404, 412]
 
     for error_code in error_codes:
         setup_mock_response(

From b27d5cf3fd5969a4bb94455e94fbf77b31c6dd99 Mon Sep 17 00:00:00 2001
From: Mews <60406199+Mews@users.noreply.github.com>
Date: Sat, 29 Jun 2024 20:43:35 +0100
Subject: [PATCH 2/7] Retry up to 5 times when encountering a transient error

---
 src/tiny_web_crawler/networking/fetcher.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/tiny_web_crawler/networking/fetcher.py b/src/tiny_web_crawler/networking/fetcher.py
index e592277..d83ef81 100644
--- a/src/tiny_web_crawler/networking/fetcher.py
+++ b/src/tiny_web_crawler/networking/fetcher.py
@@ -1,20 +1,34 @@
 from typing import Optional
+import time
 
 import requests
 from bs4 import BeautifulSoup
 
 from tiny_web_crawler.logging import get_logger
 
+MAX_RETRIES = 5
+
 logger = get_logger()
 
-def fetch_url(url: str) -> Optional[BeautifulSoup]:
+def is_transient_error(status_code: int) -> bool:
+    transient_errors = [408, 502, 503, 504]
+
+    return status_code in transient_errors
+
+def fetch_url(url: str, retries: int = MAX_RETRIES) -> Optional[BeautifulSoup]:
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         data = response.text
         return BeautifulSoup(data, 'lxml')
     except requests.exceptions.HTTPError as http_err:
+        if response.status_code and is_transient_error(response.status_code) and retries > 1:
+            logger.error("Transient HTTP error occurred: %s. Retrying...", http_err)
+            time.sleep( 1*(MAX_RETRIES-retries+1) )
+            return fetch_url( url, retries-1 )
+
         logger.error("HTTP error occurred: %s", http_err)
+        return None
     except requests.exceptions.ConnectionError as conn_err:
         logger.error("Connection error occurred: %s", conn_err)
     except requests.exceptions.Timeout as timeout_err:

From da1ce7820d76a96fc8d71a2988fb8a69ee0dcc07 Mon Sep 17 00:00:00 2001
From: Mews <60406199+Mews@users.noreply.github.com>
Date: Sat, 29 Jun 2024 20:45:38 +0100
Subject: [PATCH 3/7] Test transient error retrying

---
 tests/networking/test_fetcher.py | 53 ++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tests/networking/test_fetcher.py b/tests/networking/test_fetcher.py
index a1a1fd2..a0ecb3f 100644
--- a/tests/networking/test_fetcher.py
+++ b/tests/networking/test_fetcher.py
@@ -1,3 +1,4 @@
+from unittest.mock import patch
 import responses
 import requests
 
@@ -80,3 +81,55 @@ def test_fetch_url_requests_exception(caplog) -> None: # type: ignore
 
     assert "Request error occurred:" in caplog.text
     assert resp is None
+
+@patch("time.sleep")
+@responses.activate
+def test_fetch_url_transient_error_retry(mock_sleep, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    with caplog.at_level(ERROR):
+        resp = fetch_url("http://transient.error")
+
+    assert resp is None
+
+    # Assert url was fetched 5 times
+    assert len(responses.calls) == 5
+
+    # Assert sleep time grew with every request
+    expected_delays = [1, 2, 3, 4]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text
+
+@patch("time.sleep")
+@responses.activate
+def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=200
+    )
+
+    with caplog.at_level(ERROR):
+        resp = fetch_url("http://transient.error")
+
+    assert resp is not None
+    assert resp.text == "link"
+
+    # Assert url was fetched 2 times
+    assert len(responses.calls) == 2
+
+    # Assert time.sleep was called
+    mock_sleep.assert_called_once_with(1)
+
+    assert "Transient HTTP error occurred:" in caplog.text

From 66c629d8531691c0316c03199c16abb84a71d4a9 Mon Sep 17 00:00:00 2001
From: Mews <60406199+Mews@users.noreply.github.com>
Date: Sat, 29 Jun 2024 20:50:58 +0100
Subject: [PATCH 4/7] Moved `TRANSIENT_ERRORS` global variable outside of
 function

---
 src/tiny_web_crawler/networking/fetcher.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/tiny_web_crawler/networking/fetcher.py b/src/tiny_web_crawler/networking/fetcher.py
index d83ef81..e54acfe 100644
--- a/src/tiny_web_crawler/networking/fetcher.py
+++ b/src/tiny_web_crawler/networking/fetcher.py
@@ -7,13 +7,12 @@
 from tiny_web_crawler.logging import get_logger
 
 MAX_RETRIES = 5
+TRANSIENT_ERRORS = [408, 502, 503, 504]
 
 logger = get_logger()
 
 def is_transient_error(status_code: int) -> bool:
-    transient_errors = [408, 502, 503, 504]
-
-    return status_code in transient_errors
+    return status_code in TRANSIENT_ERRORS
 
 def fetch_url(url: str, retries: int = MAX_RETRIES) -> Optional[BeautifulSoup]:
     try:

From db52f2280f520d3790d8e83c8d992843f5633d1f Mon Sep 17 00:00:00 2001
From: Mews <60406199+Mews@users.noreply.github.com>
Date: Sat, 29 Jun 2024 23:37:25 +0100
Subject: [PATCH 5/7] Made max retry attempts configurable

---
 src/tiny_web_crawler/core/spider.py          |  2 +-
 src/tiny_web_crawler/core/spider_settings.py |  1 +
 src/tiny_web_crawler/networking/fetcher.py   |  9 ++++----
 tests/networking/test_fetcher.py             | 24 ++++++++++++--------
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/tiny_web_crawler/core/spider.py b/src/tiny_web_crawler/core/spider.py
index 9af0af4..8d8b040 100644
--- a/src/tiny_web_crawler/core/spider.py
+++ b/src/tiny_web_crawler/core/spider.py
@@ -85,7 +85,7 @@ def crawl(self, url: str) -> None:
             return
 
         logger.debug("Crawling: %s", url)
-        soup = fetch_url(url)
+        soup = fetch_url(url, retries=self.settings.max_retry_attempts)
         if not soup:
             return
 
diff --git a/src/tiny_web_crawler/core/spider_settings.py b/src/tiny_web_crawler/core/spider_settings.py
index 7d383dd..5bd2f21 100644
--- a/src/tiny_web_crawler/core/spider_settings.py
+++ b/src/tiny_web_crawler/core/spider_settings.py
@@ -40,6 +40,7 @@ class CrawlSettings:
     internal_links_only: bool = False
     external_links_only: bool = False
     respect_robots_txt: bool = True
+    max_retry_attempts: int = 5
 
 @dataclass
 class SpiderSettings(GeneralSettings, CrawlSettings):
diff --git a/src/tiny_web_crawler/networking/fetcher.py b/src/tiny_web_crawler/networking/fetcher.py
index e54acfe..38addb6 100644
--- a/src/tiny_web_crawler/networking/fetcher.py
+++ b/src/tiny_web_crawler/networking/fetcher.py
@@ -6,7 +6,6 @@
 
 from tiny_web_crawler.logging import get_logger
 
-MAX_RETRIES = 5
 TRANSIENT_ERRORS = [408, 502, 503, 504]
 
 logger = get_logger()
@@ -14,17 +13,17 @@ def is_transient_error(status_code: int) -> bool:
     return status_code in TRANSIENT_ERRORS
 
-def fetch_url(url: str, retries: int = MAX_RETRIES) -> Optional[BeautifulSoup]:
+def fetch_url(url: str, retries: int, attempts: int = 0) -> Optional[BeautifulSoup]:
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         data = response.text
         return BeautifulSoup(data, 'lxml')
     except requests.exceptions.HTTPError as http_err:
-        if response.status_code and is_transient_error(response.status_code) and retries > 1:
+        if response.status_code and is_transient_error(response.status_code) and retries > 0:
             logger.error("Transient HTTP error occurred: %s. Retrying...", http_err)
-            time.sleep( 1*(MAX_RETRIES-retries+1) )
-            return fetch_url( url, retries-1 )
+            time.sleep( attempts+1 )
+            return fetch_url( url, retries-1 , attempts+1)
 
         logger.error("HTTP error occurred: %s", http_err)
         return None
diff --git a/tests/networking/test_fetcher.py b/tests/networking/test_fetcher.py
index a0ecb3f..69c6aa0 100644
--- a/tests/networking/test_fetcher.py
+++ b/tests/networking/test_fetcher.py
@@ -16,7 +16,7 @@ def test_fetch_url() -> None:
         status=200
     )
 
-    resp = fetch_url("http://example.com")
+    resp = fetch_url("http://example.com", 1)
 
     assert resp is not None
     assert resp.text == "link"
@@ -27,7 +27,7 @@ def test_fetch_url_connection_error(caplog) -> None: # type: ignore
 
     with caplog.at_level(ERROR):
         # Fetch url whose response isn't mocked to raise ConnectionError
-        resp = fetch_url("http://connection.error")
+        resp = fetch_url("http://connection.error", 1)
 
     assert "Connection error occurred:" in caplog.text
     assert resp is None
@@ -45,7 +45,7 @@ def test_fetch_url_http_error(caplog) -> None: # type: ignore
         )
 
         with caplog.at_level(ERROR):
-            resp = fetch_url(f"http://http.error/{error_code}")
+            resp = fetch_url(f"http://http.error/{error_code}", 1)
 
         assert "HTTP error occurred:" in caplog.text
         assert resp is None
@@ -61,7 +61,7 @@ def test_fetch_url_timeout_error(caplog) -> None: # type: ignore
 
     with caplog.at_level(ERROR):
         # Fetch url whose response isn't mocked to raise ConnectionError
-        resp = fetch_url("http://timeout.error")
+        resp = fetch_url("http://timeout.error", 1)
 
     assert "Timeout error occurred:" in caplog.text
     assert resp is None
@@ -77,7 +77,7 @@ def test_fetch_url_requests_exception(caplog) -> None: # type: ignore
 
     with caplog.at_level(ERROR):
         # Fetch url whose response isn't mocked to raise ConnectionError
-        resp = fetch_url("http://requests.exception")
+        resp = fetch_url("http://requests.exception", 1)
 
     assert "Request error occurred:" in caplog.text
     assert resp is None
@@ -91,16 +91,18 @@ def test_fetch_url_transient_error_retry(mock_sleep, caplog) -> None: # type: ig
         status=503
     )
 
+    max_retry_attempts = 5
+
     with caplog.at_level(ERROR):
-        resp = fetch_url("http://transient.error")
+        resp = fetch_url("http://transient.error", max_retry_attempts)
 
     assert resp is None
 
-    # Assert url was fetched 5 times
-    assert len(responses.calls) == 5
+    # Assert url was fetched once then retried max_retry_attempts times
+    assert len(responses.calls) == max_retry_attempts + 1
 
     # Assert sleep time grew with every request
-    expected_delays = [1, 2, 3, 4]
+    expected_delays = [1, 2, 3, 4, 5]
     actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
     assert actual_delays == expected_delays
@@ -120,8 +122,10 @@ def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None: #
         status=200
     )
 
+    max_retry_attempts = 1
+
     with caplog.at_level(ERROR):
-        resp = fetch_url("http://transient.error")
+        resp = fetch_url("http://transient.error", max_retry_attempts)
 
     assert resp is not None
     assert resp.text == "link"

From 49029a23d0fb07147ba0f3501d972c8900b91ae7 Mon Sep 17 00:00:00 2001
From: Mews <60406199+Mews@users.noreply.github.com>
Date: Sat, 29 Jun 2024 23:40:04 +0100
Subject: [PATCH 6/7] Add test for custom amount of retry attempts

---
 tests/networking/test_fetcher.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/tests/networking/test_fetcher.py b/tests/networking/test_fetcher.py
index 69c6aa0..e609c1b 100644
--- a/tests/networking/test_fetcher.py
+++ b/tests/networking/test_fetcher.py
@@ -82,9 +82,10 @@ def test_fetch_url_requests_exception(caplog) -> None: # type: ignore
     assert "Request error occurred:" in caplog.text
     assert resp is None
 
+
 @patch("time.sleep")
 @responses.activate
-def test_fetch_url_transient_error_retry(mock_sleep, caplog) -> None: # type: ignore
+def test_fetch_url_transient_error_retry_5(mock_sleep, caplog) -> None: # type: ignore
     setup_mock_response(
         url="http://transient.error",
         body="<html><body><a href='http://transient.error'>link</a></body></html>",
         status=503
@@ -108,6 +109,34 @@ def test_fetch_url_transient_error_retry(mock_sleep, caplog) -> None: # type: ig
 
     assert "Transient HTTP error occurred:" in caplog.text
 
+
+@patch("time.sleep")
+@responses.activate
+def test_fetch_url_transient_error_retry_10(mock_sleep, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    max_retry_attempts = 10
+
+    with caplog.at_level(ERROR):
+        resp = fetch_url("http://transient.error", max_retry_attempts)
+
+    assert resp is None
+
+    # Assert url was fetched once then retried max_retry_attempts times
+    assert len(responses.calls) == max_retry_attempts + 1
+
+    # Assert sleep time grew with every request
+    expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text
+
+
 @patch("time.sleep")
 @responses.activate
 def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None: # type: ignore

From 1fe33faf65fc7bba6eb3e22bf62063cb4b52e0af Mon Sep 17 00:00:00 2001
From: Mews <60406199+Mews@users.noreply.github.com>
Date: Sat, 29 Jun 2024 23:51:51 +0100
Subject: [PATCH 7/7] Add tests for transient error retrying in crawl method

---
 tests/test_crawler.py | 59 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 6420e49..7acf0d2 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -8,7 +8,7 @@
 
 from tiny_web_crawler import Spider
 from tiny_web_crawler import SpiderSettings
-from tiny_web_crawler.logging import DEBUG, WARNING
+from tiny_web_crawler.logging import DEBUG, WARNING, ERROR
 from tests.utils import setup_mock_response
 
 @responses.activate
@@ -490,3 +490,60 @@ def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> Non
 def test_crawl_no_root_url() -> None:
     with pytest.raises(ValueError):
         Spider(SpiderSettings(verbose=False))
+
+
+@patch("time.sleep")
+@responses.activate
+def test_crawl_url_transient_retry(mock_sleep, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    spider = Spider(
+        SpiderSettings(root_url="http://transient.error",
+                       respect_robots_txt=False)
+    )
+
+    with caplog.at_level(ERROR):
+        spider.crawl("http://transient.error")
+
+    assert spider.crawl_result == {}
+
+    assert len(responses.calls) == 6
+
+    expected_delays = [1, 2, 3, 4, 5]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text
+
+
+@patch("time.sleep")
+@responses.activate
+def test_crawl_url_transient_retry_custom_retry_amount(mock_sleep, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://transient.error",
+        body="<html><body><a href='http://transient.error'>link</a></body></html>",
+        status=503
+    )
+
+    spider = Spider(
+        SpiderSettings(root_url="http://transient.error",
+                       max_retry_attempts=10,
+                       respect_robots_txt=False)
+    )
+
+    with caplog.at_level(ERROR):
+        spider.crawl("http://transient.error")
+
+    assert spider.crawl_result == {}
+
+    assert len(responses.calls) == 11
+
+    expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
+    assert actual_delays == expected_delays
+
+    assert "Transient HTTP error occurred:" in caplog.text
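
Usage note: a minimal sketch of the behaviour this series adds, using only names
that appear in the diffs above (`Spider`, `SpiderSettings`, `max_retry_attempts`,
`crawl`, `crawl_result`); the URL and the retry count of 3 are placeholders:

    from tiny_web_crawler import Spider, SpiderSettings

    # Transient HTTP errors (408, 502, 503, 504) are retried with a linearly
    # growing delay (1s, 2s, ...); max_retry_attempts caps the number of
    # retries per fetch (it defaults to 5 in CrawlSettings).
    spider = Spider(
        SpiderSettings(
            root_url="http://example.com",
            max_retry_attempts=3,
        )
    )

    spider.crawl("http://example.com")
    print(spider.crawl_result)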