Merge pull request #45 from Mews/respect-robots-txt
Feature: Respect robots txt
indrajithi authored Jun 19, 2024
2 parents b4bf3db + 83e30ef commit 6f5ea34
Showing 6 changed files with 358 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -288,7 +288,7 @@ ignored-parents=
max-args=10

# Maximum number of attributes for a class (see R0902).
max-attributes=15
max-attributes=17

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
84 changes: 65 additions & 19 deletions src/tiny_web_crawler/core/spider.py
@@ -8,10 +8,13 @@
from typing import Dict, List, Optional, Set, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.parse
import urllib.robotparser
import requests

from tiny_web_crawler.networking.fetcher import fetch_url
from tiny_web_crawler.networking.validator import is_valid_url
from tiny_web_crawler.networking.formatter import format_url
from tiny_web_crawler.networking.robots_txt import is_robots_txt_allowed, setup_robots_txt_parser, get_robots_txt_url
from tiny_web_crawler.logging import get_logger, set_logging_level, INFO, DEBUG

DEFAULT_SCHEME: str = 'http://'
@@ -35,6 +38,7 @@ class Spider:
include_body (bool): Whether or not to include the crawled page's body in crawl_result (default: False)
internal_links_only (bool): Whether or not to crawl only internal links
external_links_only (bool): Whether or not to crawl only external links
respect_robots_txt (bool): Whether or not to respect websites' robots.txt files (default: True)
"""

root_url: str
@@ -51,11 +55,14 @@ class Spider:
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

def __post_init__(self) -> None:
self.scheme = DEFAULT_SCHEME
self.scheme: str = DEFAULT_SCHEME

self.root_netloc = urllib.parse.urlparse(self.root_url).netloc
self.robots: Dict[str, urllib.robotparser.RobotFileParser] = {}

self.root_netloc: str = urllib.parse.urlparse(self.root_url).netloc

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
@@ -65,6 +72,15 @@ def __post_init__(self) -> None:
else:
set_logging_level(INFO)

if not self.respect_robots_txt:
logger.warning(
"Ignoring robots.txt files! You might be at risk of:\n"+
"Agent/IP bans;\n"+
"Disrupted operation;\n"+
"Increased suspicion from anti-bot services;\n"+
"Potential legal action;"
)

def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
@@ -88,6 +104,10 @@ def crawl(self, url: str) -> None:
logger.debug("URL already crawled: %s", url)
return

if self.respect_robots_txt and not self._handle_robots_txt(url):
logger.debug("Skipped: Url doesn't allow crawling: %s", url)
return

logger.debug("Crawling: %s", url)
soup = fetch_url(url)
if not soup:
@@ -101,24 +121,8 @@

for link in links:
pretty_url = format_url(link['href'].lstrip(), url, self.scheme)
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
continue

if pretty_url in self.crawl_result[url]['urls']:
continue

if self.url_regex:
if not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
continue

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
continue

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
if self._should_skip_link(pretty_url, url):
continue

self.crawl_result[url]['urls'].append(pretty_url)
@@ -129,6 +133,48 @@
self.link_count += 1
logger.debug("Links crawled: %s", self.link_count)

def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
return True

if pretty_url in self.crawl_result[url]['urls']:
return True

if self.url_regex and not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
return True

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
return True

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
return True

return False

def _handle_robots_txt(self, url: str) -> bool:
user_agent = requests.utils.default_user_agent()
robots_url = get_robots_txt_url(url)

if robots_url in self.robots:
robot_parser = self.robots[robots_url]
else:
robot_parser = setup_robots_txt_parser(robots_url)

self.robots[robots_url] = robot_parser

if not is_robots_txt_allowed(url, robot_parser):
return False

crawl_delay = robot_parser.crawl_delay(user_agent)
if crawl_delay is not None:
time.sleep(float(crawl_delay))

return True

def start(self) -> Dict[str, Dict[str, List[str]]]:
"""
Starts the crawling process from the root URL. Crawls up to max_links URLs.
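As a caller-side illustration of the new flag, here is a minimal sketch. It assumes Spider is importable from tiny_web_crawler.core.spider and can be constructed from keyword arguments alone, with the remaining dataclass fields left at the defaults shown above:

from tiny_web_crawler.core.spider import Spider

# Default after this change: robots.txt rules and crawl-delay are honoured.
polite_spider = Spider(root_url="http://example.com", respect_robots_txt=True)
results = polite_spider.start()  # Dict[str, Dict[str, List[str]]]

# Opting out logs the warning added in __post_init__ and skips the robots.txt check.
unrestricted_spider = Spider(root_url="http://example.com", respect_robots_txt=False)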
56 changes: 56 additions & 0 deletions src/tiny_web_crawler/networking/robots_txt.py
@@ -0,0 +1,56 @@
import urllib.parse
import urllib.robotparser
from typing import Optional
import requests

def get_robots_txt_url(url: str) -> str:
"""
Returns the URL of the robots.txt file for the provided URL.
Args:
url (str): The URL to get the robots.txt of.
Returns:
str: The robots.txt url.
"""

parsed_url = urllib.parse.urlparse(url)

return parsed_url.scheme + "://"+ parsed_url.netloc + "/robots.txt"


def is_robots_txt_allowed(url: str, robot_parser: Optional[urllib.robotparser.RobotFileParser] = None) -> bool:
"""
Checks if the provided URL can be crawled, according to its corresponding robots.txt file
Args:
url (str): The URL to check.
Returns:
bool: True if the URL can be crawled, False otherwise.
"""

user_agent = requests.utils.default_user_agent()

if robot_parser is None:
robot_parser = setup_robots_txt_parser(url)

return robot_parser.can_fetch(user_agent, url)


def setup_robots_txt_parser(robots_txt_url: str) -> urllib.robotparser.RobotFileParser:
"""
Creates a RobotFileParser object from the given url to a robots.txt file
Args:
robots_txt_url (str): The URL to the robots.txt file.
Returns:
urllib.robotparser.RobotFileParser: The RobotFileParser object with the url already read.
"""

robot_parser = urllib.robotparser.RobotFileParser()
robot_parser.set_url(robots_txt_url)
robot_parser.read()

return robot_parser
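
For orientation, a short sketch of how these three helpers compose (assuming the module path in this diff; example.com is a placeholder, and network access is needed because setup_robots_txt_parser reads the file on creation):

from tiny_web_crawler.networking.robots_txt import (
    get_robots_txt_url,
    is_robots_txt_allowed,
    setup_robots_txt_parser,
)

page_url = "http://example.com/some/page"

# Derive the robots.txt location from any URL on the host.
robots_url = get_robots_txt_url(page_url)  # -> "http://example.com/robots.txt"

# Build the parser once per host and reuse it; read() fetches the file.
parser = setup_robots_txt_parser(robots_url)

# Check whether the default requests user agent may fetch this page.
if is_robots_txt_allowed(page_url, parser):
    ...  # safe to crawl

The Spider change above caches these parsers in self.robots keyed by robots.txt URL, so each host's robots.txt is fetched only once per crawl.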
39 changes: 14 additions & 25 deletions tests/networking/test_formatter.py
@@ -1,27 +1,16 @@
from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME

def test_format_url() -> None:
assert (
format_url("/test", "http://example.com")
== "http://example.com/test"
)

assert (
format_url("http://example.com/test", "http://example.com")
== "http://example.com/test"
)
import pytest

assert (
format_url('path1/path2', 'http://example.com')
== 'http://example.com/path1/path2'
)

assert (
format_url('/path1/path2', 'http://example.com')
== 'http://example.com/path1/path2'
)
from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME

assert (
format_url('path.com', 'http://example.com')
== DEFAULT_SCHEME + 'path.com'
)
@pytest.mark.parametrize(
"url, base_url, expected",
[
("/test", "http://example.com", "http://example.com/test"),
("http://example.com/test", "http://example.com", "http://example.com/test"),
("path1/path2", "http://example.com", "http://example.com/path1/path2"),
("/path1/path2", "http://example.com", "http://example.com/path1/path2"),
("path.com", "http://example.com", f"{DEFAULT_SCHEME}path.com"),
]
)
def test_format_url(url: str, base_url: str, expected: str) -> None:
assert format_url(url, base_url) == expected
62 changes: 62 additions & 0 deletions tests/networking/test_robots_txt.py
@@ -0,0 +1,62 @@
from unittest.mock import patch, MagicMock
from io import BytesIO
import urllib.robotparser

import pytest

from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser

@pytest.mark.parametrize(
"url, expected",
[
("http://example", "http://example/robots.txt"),
("http://example/path", "http://example/robots.txt"),
("https://example/", "https://example/robots.txt"),
("http://example/path1/path2/path3/path4", "http://example/robots.txt"),
("http://example/path#fragment", "http://example/robots.txt"),
("http://example/path?query=test", "http://example/robots.txt"),
]
)
def test_get_robots_txt_url(url: str, expected: str) -> None:
assert get_robots_txt_url(url) == expected


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nAllow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert not is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_mixed(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /private"

mock_urlopen.return_value = BytesIO(mock_response)
assert is_robots_txt_allowed("http://example.com")

mock_urlopen.return_value = BytesIO(mock_response)
assert not is_robots_txt_allowed("http://example.com/private")


def test_is_robots_txt_allowed_no_robots_txt() -> None:
# Check that websites with no robots.txt are set as crawlable
assert is_robots_txt_allowed("http://example.com")


def test_setup_robots_txt_parser() -> None:
robot_parser = setup_robots_txt_parser("http://example.com")

assert isinstance(robot_parser, urllib.robotparser.RobotFileParser)
