Merge pull request #45 from Mews/respect-robots-txt
Feature: Respect robots txt
indrajithi authored Jun 19, 2024
2 parents b4bf3db + 83e30ef commit 6f5ea34
Showing 6 changed files with 358 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -288,7 +288,7 @@ ignored-parents=
max-args=10

# Maximum number of attributes for a class (see R0902).
max-attributes=15
max-attributes=17

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
84 changes: 65 additions & 19 deletions src/tiny_web_crawler/core/spider.py
@@ -8,10 +8,13 @@
from typing import Dict, List, Optional, Set, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.parse
import urllib.robotparser
import requests

from tiny_web_crawler.networking.fetcher import fetch_url
from tiny_web_crawler.networking.validator import is_valid_url
from tiny_web_crawler.networking.formatter import format_url
from tiny_web_crawler.networking.robots_txt import is_robots_txt_allowed, setup_robots_txt_parser, get_robots_txt_url
from tiny_web_crawler.logging import get_logger, set_logging_level, INFO, DEBUG

DEFAULT_SCHEME: str = 'http://'
@@ -35,6 +38,7 @@ class Spider:
include_body (bool): Whether or not to include the crawled page's body in crawl_result (default: False)
internal_links_only (bool): Whether or not to crawl only internal links
external_links_only (bool): Whether or not to crawl only external links
respect_robots_txt (bool): Whether or not to respect websites' robots.txt files (default: True)
"""

root_url: str
@@ -51,11 +55,14 @@ class Spider:
include_body: bool = False
internal_links_only: bool = False
external_links_only: bool = False
respect_robots_txt: bool = True

def __post_init__(self) -> None:
self.scheme = DEFAULT_SCHEME
self.scheme: str = DEFAULT_SCHEME

self.root_netloc = urllib.parse.urlparse(self.root_url).netloc
self.robots: Dict[str, urllib.robotparser.RobotFileParser] = {}

self.root_netloc: str = urllib.parse.urlparse(self.root_url).netloc

if self.internal_links_only and self.external_links_only:
raise ValueError("Only one of internal_links_only and external_links_only can be set to True")
@@ -65,6 +72,15 @@ def __post_init__(self) -> None:
else:
set_logging_level(INFO)

if not self.respect_robots_txt:
logger.warning(
"Ignoring robots.txt files! You might be at risk of:\n"+
"Agent/IP bans;\n"+
"Disrupted operation;\n"+
"Increased suspicion from anti-bot services;\n"+
"Potential legal action;"
)

def save_results(self) -> None:
"""
Saves the crawl results into a JSON file.
@@ -88,6 +104,10 @@ def crawl(self, url: str) -> None:
logger.debug("URL already crawled: %s", url)
return

if self.respect_robots_txt and not self._handle_robots_txt(url):
logger.debug("Skipped: Url doesn't allow crawling: %s", url)
return

logger.debug("Crawling: %s", url)
soup = fetch_url(url)
if not soup:
@@ -101,24 +121,8 @@

for link in links:
pretty_url = format_url(link['href'].lstrip(), url, self.scheme)
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
continue

if pretty_url in self.crawl_result[url]['urls']:
continue

if self.url_regex:
if not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
continue

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
continue

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
if self._should_skip_link(pretty_url, url):
continue

self.crawl_result[url]['urls'].append(pretty_url)
@@ -129,6 +133,48 @@
self.link_count += 1
logger.debug("Links crawled: %s", self.link_count)

def _should_skip_link(self, pretty_url: str, url: str) -> bool:
if not is_valid_url(pretty_url):
logger.debug("Invalid url: %s", pretty_url)
return True

if pretty_url in self.crawl_result[url]['urls']:
return True

if self.url_regex and not re.compile(self.url_regex).match(pretty_url):
logger.debug("Skipping: URL didn't match regex: %s", pretty_url)
return True

if self.internal_links_only and self.root_netloc != urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: External link: %s", pretty_url)
return True

if self.external_links_only and self.root_netloc == urllib.parse.urlparse(pretty_url).netloc:
logger.debug("Skipping: Internal link: %s", pretty_url)
return True

return False

def _handle_robots_txt(self, url: str) -> bool:
user_agent = requests.utils.default_user_agent()
robots_url = get_robots_txt_url(url)

if robots_url in self.robots:
robot_parser = self.robots[robots_url]
else:
robot_parser = setup_robots_txt_parser(robots_url)

self.robots[robots_url] = robot_parser

if not is_robots_txt_allowed(url, robot_parser):
return False

crawl_delay = robot_parser.crawl_delay(user_agent)
if crawl_delay is not None:
time.sleep(float(crawl_delay))

return True

def start(self) -> Dict[str, Dict[str, List[str]]]:
"""
Starts the crawling process from the root URL. Crawls up to max_links URLs.
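As a caller-side illustration of the new flag, here is a minimal sketch. It assumes Spider is importable from tiny_web_crawler.core.spider and can be constructed from keyword arguments alone, with the remaining dataclass fields left at the defaults shown above:

from tiny_web_crawler.core.spider import Spider

# Default after this change: robots.txt rules and crawl-delay are honoured.
polite_spider = Spider(root_url="http://example.com", respect_robots_txt=True)
results = polite_spider.start()  # Dict[str, Dict[str, List[str]]]

# Opting out logs the warning added in __post_init__ and skips the robots.txt check.
unrestricted_spider = Spider(root_url="http://example.com", respect_robots_txt=False)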
56 changes: 56 additions & 0 deletions src/tiny_web_crawler/networking/robots_txt.py
@@ -0,0 +1,56 @@
import urllib.parse
import urllib.robotparser
from typing import Optional
import requests

def get_robots_txt_url(url: str) -> str:
"""
Returns the URL of the robots.txt file for the provided URL.
Args:
url (str): The URL to get the robots.txt of.
Returns:
str: The robots.txt url.
"""

parsed_url = urllib.parse.urlparse(url)

return parsed_url.scheme + "://"+ parsed_url.netloc + "/robots.txt"


def is_robots_txt_allowed(url: str, robot_parser: Optional[urllib.robotparser.RobotFileParser] = None) -> bool:
"""
Checks if the provided URL can be crawled, according to its corresponding robots.txt file
Args:
url (str): The URL to check.
Returns:
bool: True if the URL can be crawled, False otherwise.
"""

user_agent = requests.utils.default_user_agent()

if robot_parser is None:
robot_parser = setup_robots_txt_parser(url)

return robot_parser.can_fetch(user_agent, url)


def setup_robots_txt_parser(robots_txt_url: str) -> urllib.robotparser.RobotFileParser:
"""
Creates a RobotFileParser object from the given url to a robots.txt file
Args:
robots_txt_url (str): The URL to the robots.txt file.
Returns:
urllib.robotparser.RobotFileParser: The RobotFileParser object with the url already read.
"""

robot_parser = urllib.robotparser.RobotFileParser()
robot_parser.set_url(robots_txt_url)
robot_parser.read()

return robot_parser
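
For orientation, a short sketch of how these three helpers compose (assuming the module path in this diff; example.com is a placeholder, and network access is needed because setup_robots_txt_parser reads the file on creation):

from tiny_web_crawler.networking.robots_txt import (
    get_robots_txt_url,
    is_robots_txt_allowed,
    setup_robots_txt_parser,
)

page_url = "http://example.com/some/page"

# Derive the robots.txt location from any URL on the host.
robots_url = get_robots_txt_url(page_url)  # -> "http://example.com/robots.txt"

# Build the parser once per host and reuse it; read() fetches the file.
parser = setup_robots_txt_parser(robots_url)

# Check whether the default requests user agent may fetch this page.
if is_robots_txt_allowed(page_url, parser):
    ...  # safe to crawl

The Spider change above caches these parsers in self.robots keyed by robots.txt URL, so each host's robots.txt is fetched only once per crawl.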
39 changes: 14 additions & 25 deletions tests/networking/test_formatter.py
@@ -1,27 +1,16 @@
from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME

def test_format_url() -> None:
assert (
format_url("/test", "http://example.com")
== "http://example.com/test"
)

assert (
format_url("http://example.com/test", "http://example.com")
== "http://example.com/test"
)
import pytest

assert (
format_url('path1/path2', 'http://example.com')
== 'http://example.com/path1/path2'
)

assert (
format_url('/path1/path2', 'http://example.com')
== 'http://example.com/path1/path2'
)
from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME

assert (
format_url('path.com', 'http://example.com')
== DEFAULT_SCHEME + 'path.com'
)
@pytest.mark.parametrize(
"url, base_url, expected",
[
("/test", "http://example.com", "http://example.com/test"),
("http://example.com/test", "http://example.com", "http://example.com/test"),
("path1/path2", "http://example.com", "http://example.com/path1/path2"),
("/path1/path2", "http://example.com", "http://example.com/path1/path2"),
("path.com", "http://example.com", f"{DEFAULT_SCHEME}path.com"),
]
)
def test_format_url(url: str, base_url: str, expected: str) -> None:
assert format_url(url, base_url) == expected
62 changes: 62 additions & 0 deletions tests/networking/test_robots_txt.py
@@ -0,0 +1,62 @@
from unittest.mock import patch, MagicMock
from io import BytesIO
import urllib.robotparser

import pytest

from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser

@pytest.mark.parametrize(
"url, expected",
[
("http://example", "http://example/robots.txt"),
("http://example/path", "http://example/robots.txt"),
("https://example/", "https://example/robots.txt"),
("http://example/path1/path2/path3/path4", "http://example/robots.txt"),
("http://example/path#fragment", "http://example/robots.txt"),
("http://example/path?query=test", "http://example/robots.txt"),
]
)
def test_get_robots_txt_url(url: str, expected: str) -> None:
assert get_robots_txt_url(url) == expected


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nAllow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /"
mock_urlopen.return_value = BytesIO(mock_response)

assert not is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_mixed(mock_urlopen: MagicMock) -> None:
# Mock the response content of robots.txt
mock_response = b"User-agent: *\nDisallow: /private"

mock_urlopen.return_value = BytesIO(mock_response)
assert is_robots_txt_allowed("http://example.com")

mock_urlopen.return_value = BytesIO(mock_response)
assert not is_robots_txt_allowed("http://example.com/private")


def test_is_robots_txt_allowed_no_robots_txt() -> None:
# Check that websites with no robots.txt are set as crawlable
assert is_robots_txt_allowed("http://example.com")


def test_setup_robots_txt_parser() -> None:
robot_parser = setup_robots_txt_parser("http://example.com")

assert isinstance(robot_parser, urllib.robotparser.RobotFileParser)
