diff --git a/.pylintrc b/.pylintrc
index 4ab468e..4aa7d95 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -288,7 +288,7 @@ ignored-parents=
 max-args=10
 
 # Maximum number of attributes for a class (see R0902).
-max-attributes=15
+max-attributes=17
 
 # Maximum number of boolean expressions in an if statement (see R0916).
 max-bool-expr=5
diff --git a/src/tiny_web_crawler/core/spider.py b/src/tiny_web_crawler/core/spider.py
index ab8a937..9e5a2ea 100644
--- a/src/tiny_web_crawler/core/spider.py
+++ b/src/tiny_web_crawler/core/spider.py
@@ -8,10 +8,13 @@ from typing import Dict, List, Optional, Set, Any
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import urllib.parse
+import urllib.robotparser
+import requests
 
 from tiny_web_crawler.networking.fetcher import fetch_url
 from tiny_web_crawler.networking.validator import is_valid_url
 from tiny_web_crawler.networking.formatter import format_url
+from tiny_web_crawler.networking.robots_txt import is_robots_txt_allowed, setup_robots_txt_parser, get_robots_txt_url
 from tiny_web_crawler.logging import get_logger, set_logging_level, INFO, DEBUG
 
 DEFAULT_SCHEME: str = 'http://'
 
@@ -35,6 +38,7 @@ class Spider:
         include_body (bool): Whether or not to include the crawled page's body in crawl_result (default: False)
         internal_links_only (bool): Whether or not to crawl only internal links
         external_links_only (bool): Whether or not to crawl only external links
+        respect_robots_txt (bool): Whether or not to respect websites' robots.txt files (default: True)
     """
 
     root_url: str
@@ -51,11 +55,14 @@ class Spider:
     include_body: bool = False
     internal_links_only: bool = False
     external_links_only: bool = False
+    respect_robots_txt: bool = True
 
     def __post_init__(self) -> None:
-        self.scheme = DEFAULT_SCHEME
+        self.scheme: str = DEFAULT_SCHEME
 
-        self.root_netloc = urllib.parse.urlparse(self.root_url).netloc
+        self.robots: Dict[str, urllib.robotparser.RobotFileParser] = {}
+
+        self.root_netloc: str = urllib.parse.urlparse(self.root_url).netloc
 
         if self.internal_links_only and self.external_links_only:
             raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
@@ -65,6 +72,15 @@ def __post_init__(self) -> None:
         else:
             set_logging_level(INFO)
 
+        if not self.respect_robots_txt:
+            logger.warning(
+                "Ignoring robots.txt files! You might be at risk of:\n"+
+                "Agent/IP bans;\n"+
+                "Disrupted operation;\n"+
+                "Increased suspicion from anti-bot services;\n"+
+                "Potential legal action;"
+            )
+
     def save_results(self) -> None:
         """
         Saves the crawl results into a JSON file.
@@ -88,6 +104,10 @@ def crawl(self, url: str) -> None:
             logger.debug("URL already crawled: %s", url)
             return
 
+        if self.respect_robots_txt and not self._handle_robots_txt(url):
+            logger.debug("Skipped: Url doesn't allow crawling: %s", url)
+            return
+
         logger.debug("Crawling: %s", url)
         soup = fetch_url(url)
         if not soup:
@@ -101,24 +121,8 @@ def crawl(self, url: str) -> None:
         for link in links:
             pretty_url = format_url(link['href'].lstrip(), url, self.scheme)
 
-            if not is_valid_url(pretty_url):
-                logger.debug("Invalid url: %s", pretty_url)
-                continue
-
-            if pretty_url in self.crawl_result[url]['urls']:
-                continue
-
-            if self.url_regex:
-                if not re.compile(self.url_regex).match(pretty_url):
-                    logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
-                    continue
-
-            if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
-                logger.debug("Skipping: External link: %s", pretty_url)
-                continue
-            if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
-                logger.debug("Skipping: Internal link: %s", pretty_url)
+            if self._should_skip_link(pretty_url, url):
                 continue
 
             self.crawl_result[url]['urls'].append(pretty_url)
 
@@ -129,6 +133,48 @@ def crawl(self, url: str) -> None:
         self.link_count += 1
         logger.debug("Links crawled: %s", self.link_count)
 
+    def _should_skip_link(self, pretty_url: str, url: str) -> bool:
+        if not is_valid_url(pretty_url):
+            logger.debug("Invalid url: %s", pretty_url)
+            return True
+
+        if pretty_url in self.crawl_result[url]['urls']:
+            return True
+
+        if self.url_regex and not re.compile(self.url_regex).match(pretty_url):
+            logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
+            return True
+
+        if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
+            logger.debug("Skipping: External link: %s", pretty_url)
+            return True
+
+        if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
+            logger.debug("Skipping: Internal link: %s", pretty_url)
+            return True
+
+        return False
+
+    def _handle_robots_txt(self, url: str) -> bool:
+        user_agent = requests.utils.default_user_agent()
+        robots_url = get_robots_txt_url(url)
+
+        if robots_url in self.robots:
+            robot_parser = self.robots[robots_url]
+        else:
+            robot_parser = setup_robots_txt_parser(robots_url)
+
+        self.robots[robots_url] = robot_parser
+
+        if not is_robots_txt_allowed(url, robot_parser):
+            return False
+
+        crawl_delay = robot_parser.crawl_delay(user_agent)
+        if crawl_delay is not None:
+            time.sleep(float(crawl_delay))
+
+        return True
+
     def start(self) -> Dict[str, Dict[str, List[str]]]:
         """
         Starts the crawling process from the root URL. Crawls up to max_links URLs.
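For orientation, a minimal usage sketch of the new flag (the constructor call mirrors the tests added later in this diff; example.com is a placeholder site):

# Sketch only, not part of the patch.
from tiny_web_crawler.core.spider import Spider

# Default behaviour: each host's robots.txt is fetched once and cached in spider.robots,
# disallowed URLs are skipped, and any Crawl-delay directive is slept on before fetching.
spider = Spider("http://example.com", respect_robots_txt=True)
results = spider.start()

# Opting out triggers the warning added in __post_init__ above.
careless_spider = Spider("http://example.com", respect_robots_txt=False)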
diff --git a/src/tiny_web_crawler/networking/robots_txt.py b/src/tiny_web_crawler/networking/robots_txt.py
new file mode 100644
index 0000000..fd24d88
--- /dev/null
+++ b/src/tiny_web_crawler/networking/robots_txt.py
@@ -0,0 +1,56 @@
+import urllib.parse
+import urllib.robotparser
+from typing import Optional
+import requests
+
+def get_robots_txt_url(url: str) -> str:
+    """
+    Returns the URL of the robots.txt file for the provided URL.
+
+    Args:
+        url (str): The URL to get the robots.txt of.
+
+    Returns:
+        str: The robots.txt URL.
+    """
+
+    parsed_url = urllib.parse.urlparse(url)
+
+    return parsed_url.scheme + "://" + parsed_url.netloc + "/robots.txt"
+
+
+def is_robots_txt_allowed(url: str, robot_parser: Optional[urllib.robotparser.RobotFileParser] = None) -> bool:
+    """
+    Checks if the provided URL can be crawled, according to its corresponding robots.txt file.
+
+    Args:
+        url (str): The URL to check.
+
+    Returns:
+        bool: True if the URL can be crawled, False otherwise.
+    """
+
+    user_agent = requests.utils.default_user_agent()
+
+    if robot_parser is None:
+        robot_parser = setup_robots_txt_parser(url)
+
+    return robot_parser.can_fetch(user_agent, url)
+
+
+def setup_robots_txt_parser(robots_txt_url: str) -> urllib.robotparser.RobotFileParser:
+    """
+    Creates a RobotFileParser object from the given URL of a robots.txt file.
+
+    Args:
+        robots_txt_url (str): The URL to the robots.txt file.
+
+    Returns:
+        urllib.robotparser.RobotFileParser: The RobotFileParser object with the URL already read.
+    """
+
+    robot_parser = urllib.robotparser.RobotFileParser()
+    robot_parser.set_url(robots_txt_url)
+    robot_parser.read()
+
+    return robot_parser
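A short sketch of how the three helpers compose, following the same caching pattern as Spider._handle_robots_txt above (the page URL and cache dict are placeholders):

# Sketch only, not part of the patch.
from tiny_web_crawler.networking.robots_txt import (
    get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser)

robots_cache = {}

def allowed(url: str) -> bool:
    robots_url = get_robots_txt_url(url)  # e.g. "http://example.com/robots.txt"
    if robots_url not in robots_cache:
        # setup_robots_txt_parser() downloads and parses the robots.txt file once.
        robots_cache[robots_url] = setup_robots_txt_parser(robots_url)
    return is_robots_txt_allowed(url, robots_cache[robots_url])

print(allowed("http://example.com/some/page"))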
+ """ + + parsed_url = urllib.parse.urlparse(url) + + return parsed_url.scheme + "://"+ parsed_url.netloc + "/robots.txt" + + +def is_robots_txt_allowed(url: str, robot_parser: Optional[urllib.robotparser.RobotFileParser] = None) -> bool: + """ + Checks if the provided URL can be crawled, according to its corresponding robots.txt file + + Args: + url (str): The URL to check. + + Returns: + bool: True if the URL can be crawled, False otherwise. + """ + + user_agent = requests.utils.default_user_agent() + + if robot_parser is None: + robot_parser = setup_robots_txt_parser(url) + + return robot_parser.can_fetch(user_agent, url) + + +def setup_robots_txt_parser(robots_txt_url: str) -> urllib.robotparser.RobotFileParser: + """ + Creates a RobotFileParser object from the given url to a robots.txt file + + Args: + robot_txt_url (str): The URL to the robots.txt file. + + Returns: + urllib.robotparser.RobotFileParser: The RobotFileParser object with the url already read. + """ + + robot_parser = urllib.robotparser.RobotFileParser() + robot_parser.set_url(robots_txt_url) + robot_parser.read() + + return robot_parser diff --git a/tests/networking/test_formatter.py b/tests/networking/test_formatter.py index 3e1fb9f..9d17fef 100644 --- a/tests/networking/test_formatter.py +++ b/tests/networking/test_formatter.py @@ -1,27 +1,16 @@ -from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME - -def test_format_url() -> None: - assert ( - format_url("/test", "http://example.com") - == "http://example.com/test" - ) - - assert ( - format_url("http://example.com/test", "http://example.com") - == "http://example.com/test" - ) +import pytest - assert ( - format_url('path1/path2', 'http://example.com') - == 'http://example.com/path1/path2' - ) - - assert ( - format_url('/path1/path2', 'http://example.com') - == 'http://example.com/path1/path2' - ) +from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME - assert ( - format_url('path.com', 'http://example.com') - == DEFAULT_SCHEME + 'path.com' - ) +@pytest.mark.parametrize( + "url, base_url, expected", + [ + ("/test", "http://example.com", "http://example.com/test"), + ("http://example.com/test", "http://example.com", "http://example.com/test"), + ("path1/path2", "http://example.com", "http://example.com/path1/path2"), + ("/path1/path2", "http://example.com", "http://example.com/path1/path2"), + ("path.com", "http://example.com", f"{DEFAULT_SCHEME}path.com"), + ] +) +def test_format_url(url: str, base_url: str, expected: str) -> None: + assert format_url(url, base_url) == expected diff --git a/tests/networking/test_robots_txt.py b/tests/networking/test_robots_txt.py new file mode 100644 index 0000000..0feb9ad --- /dev/null +++ b/tests/networking/test_robots_txt.py @@ -0,0 +1,62 @@ +from unittest.mock import patch, MagicMock +from io import BytesIO +import urllib.robotparser + +import pytest + +from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser + +@pytest.mark.parametrize( + "url, expected", + [ + ("http://example", "http://example/robots.txt"), + ("http://example/path", "http://example/robots.txt"), + ("https://example/", "https://example/robots.txt"), + ("http://example/path1/path2/path3/path4", "http://example/robots.txt"), + ("http://example/path#fragment", "http://example/robots.txt"), + ("http://example/path?query=test", "http://example/robots.txt"), + ] +) +def test_get_robots_txt_url(url: str, expected: str) -> None: + assert get_robots_txt_url(url) == 
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 54d1828..4b4777b 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -1,11 +1,13 @@
+from io import BytesIO
 from unittest.mock import MagicMock, mock_open, patch
+import urllib.error
 
 import responses
 import pytest
 
 from tiny_web_crawler.core.spider import Spider
-from tiny_web_crawler.logging import DEBUG
+from tiny_web_crawler.logging import DEBUG, WARNING
 
 from tests.utils import setup_mock_response
 
 @responses.activate
@@ -269,3 +271,160 @@ def test_start_with_save_to_file(
     ]
 
     mock_save_results.assert_called_once()
+
+
+@responses.activate
+@patch('urllib.request.urlopen')
+def test_respect_robots_txt(mock_urlopen, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://crawlable.com",
+        body="<a href='http://notcrawlable.com'>link</a>",
+        status=200
+    )
+    setup_mock_response(
+        url="http://notcrawlable.com",
+        body="<a href='http://crawlable.com'>link</a>",
+        status=200
+    )
+
+    mock_urlopen.side_effect = lambda url: (
+        BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
+        BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else
+        urllib.error.URLError(f"No mock for {url}"))
+
+    spider = Spider("http://crawlable.com", respect_robots_txt=True)
+
+    with caplog.at_level(DEBUG):
+        spider.start()
+
+    assert spider.crawl_result == {
+        "http://crawlable.com": {
+            "urls": ["http://notcrawlable.com"]
+        }
+    }
+
+    assert "Skipped: Url doesn't allow crawling:" in caplog.text
+
+    assert "http://notcrawlable.com/robots.txt" in spider.robots
+
+
+@responses.activate
+@patch('urllib.request.urlopen')
+def test_respect_robots_txt_allowed(mock_urlopen, caplog) -> None: # type: ignore
+    setup_mock_response(
+        url="http://crawlable.com",
+        body="<a href='http://crawlable.com'>link</a>",
+        status=200
+    )
+
+    mock_urlopen.side_effect = lambda url: (
+        BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else
+        urllib.error.URLError(f"No mock for {url}"))
+
+    spider = Spider("http://crawlable.com", respect_robots_txt=True)
+
+    with caplog.at_level(DEBUG):
+        spider.crawl("http://crawlable.com")
+
+    assert spider.crawl_result == {
"http://crawlable.com":{ + "urls": ["http://crawlable.com"] + } + } + + + +@responses.activate +@patch('urllib.request.urlopen') +def test_respect_robots_txt_not_allowed(mock_urlopen, caplog) -> None: # type: ignore + setup_mock_response( + url="http://notcrawlable.com", + body="link", + status=200 + ) + + mock_urlopen.side_effect = lambda url: ( + BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else + urllib.error.URLError(f"No mock for {url}")) + + spider = Spider("http://notcrawlable.com", respect_robots_txt=True) + + with caplog.at_level(DEBUG): + spider.crawl("http://notcrawlable.com") + + assert spider.crawl_result == {} + + assert "Skipped: Url doesn't allow crawling:" in caplog.text + + assert "http://notcrawlable.com/robots.txt" in spider.robots + + +@responses.activate +@patch('urllib.request.urlopen') +def test_respect_robots_txt_disabled(mock_urlopen, caplog) -> None: # type: ignore + setup_mock_response( + url="http://crawlable.com", + body="link", + status=200 + ) + setup_mock_response( + url="http://notcrawlable.com", + body="link", + status=200 + ) + + mock_urlopen.side_effect = lambda url: ( + BytesIO(b"User-agent: *\nAllow: /") if url == "http://crawlable.com/robots.txt" else + BytesIO(b"User-agent: *\nDisallow: /") if url == "http://notcrawlable.com/robots.txt" else + urllib.error.URLError(f"No mock for {url}")) + + with caplog.at_level(WARNING): + spider = Spider("http://crawlable.com", respect_robots_txt=False) + + assert "Ignoring robots.txt files! You might be at risk of:" in caplog.text + + + with caplog.at_level(DEBUG): + spider.start() + + assert spider.crawl_result == { + "http://crawlable.com": { + "urls": ["http://notcrawlable.com"] + }, + "http://notcrawlable.com": { + "urls": ["http://crawlable.com"] + } + } + + assert not "Skipped: Url doesn't allow crawling:" in caplog.text + + assert "http://notcrawlable.com/robots.txt" not in spider.robots + + +@responses.activate +@patch('urllib.request.urlopen') +@patch('time.sleep', return_value=None) +def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> None: # type: ignore + setup_mock_response( + url="http://crawlable.com", + body="link", + status=200 + ) + + mock_urlopen.side_effect = lambda url: ( + BytesIO(b"User-agent: *\nAllow: /\ncrawl-delay: 1") if url == "http://crawlable.com/robots.txt" else + urllib.error.URLError(f"No mock for {url}")) + + spider = Spider("http://crawlable.com", respect_robots_txt=True) + + with caplog.at_level(DEBUG): + spider.crawl("http://crawlable.com") + + assert mock_sleep.call_count == 1 + mock_sleep.assert_called_with(1.0) + + assert spider.crawl_result == { + "http://crawlable.com": { + "urls": ["http://notcrawlable.com"] + } + }