Merge pull request #45 from Mews/respect-robots-txt
Feature: Respect robots txt
Showing 6 changed files with 358 additions and 46 deletions.
New module: tiny_web_crawler.networking.robots_txt

@@ -0,0 +1,56 @@
import urllib.parse
import urllib.robotparser
from typing import Optional
import requests


def get_robots_txt_url(url: str) -> str:
    """
    Returns the URL of the robots.txt file for the provided URL.
    Args:
        url (str): The URL to get the robots.txt of.
    Returns:
        str: The robots.txt URL.
    """

    parsed_url = urllib.parse.urlparse(url)

    return parsed_url.scheme + "://" + parsed_url.netloc + "/robots.txt"
def is_robots_txt_allowed(url: str, robot_parser: Optional[urllib.robotparser.RobotFileParser] = None) -> bool:
    """
    Checks if the provided URL can be crawled, according to its corresponding robots.txt file.
    Args:
        url (str): The URL to check.
        robot_parser (Optional[urllib.robotparser.RobotFileParser]): A parser that has already
            read the site's robots.txt. If None, one is created from the URL.
    Returns:
        bool: True if the URL can be crawled, False otherwise.
    """

    user_agent = requests.utils.default_user_agent()

    if robot_parser is None:
        # Build the parser from the site's robots.txt URL, not the page URL itself.
        robot_parser = setup_robots_txt_parser(get_robots_txt_url(url))

    return robot_parser.can_fetch(user_agent, url)
def setup_robots_txt_parser(robots_txt_url: str) -> urllib.robotparser.RobotFileParser:
    """
    Creates a RobotFileParser object from the given URL to a robots.txt file.
    Args:
        robots_txt_url (str): The URL to the robots.txt file.
    Returns:
        urllib.robotparser.RobotFileParser: The RobotFileParser object with the URL already read.
    """

    robot_parser = urllib.robotparser.RobotFileParser()
    robot_parser.set_url(robots_txt_url)
    robot_parser.read()

    return robot_parser
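Taken together, these helpers can be dropped into a crawl loop to skip disallowed links. A minimal sketch under stated assumptions: the candidate URLs below are invented for illustration, and building one parser per site and reusing it is a choice made here, not something this module enforces.

from tiny_web_crawler.networking.robots_txt import (
    get_robots_txt_url,
    is_robots_txt_allowed,
    setup_robots_txt_parser,
)

# Hypothetical candidate links; in the crawler these would come from parsed pages.
candidate_urls = [
    "http://example.com/",
    "http://example.com/private/page",
]

# Fetch and parse robots.txt once for the site, then reuse the parser for every check.
robot_parser = setup_robots_txt_parser(get_robots_txt_url(candidate_urls[0]))

for url in candidate_urls:
    if is_robots_txt_allowed(url, robot_parser):
        print("allowed:", url)
    else:
        print("skipped by robots.txt:", url)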
Updated tests for tiny_web_crawler.networking.formatter: test_format_url is now parametrized

@@ -1,27 +1,16 @@
-from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME
-
-def test_format_url() -> None:
-    assert (
-        format_url("/test", "http://example.com")
-        == "http://example.com/test"
-    )
-
-    assert (
-        format_url("http://example.com/test", "http://example.com")
-        == "http://example.com/test"
-    )
-
-    assert (
-        format_url('path1/path2', 'http://example.com')
-        == 'http://example.com/path1/path2'
-    )
-
-    assert (
-        format_url('/path1/path2', 'http://example.com')
-        == 'http://example.com/path1/path2'
-    )
-
-    assert (
-        format_url('path.com', 'http://example.com')
-        == DEFAULT_SCHEME + 'path.com'
-    )
+import pytest
+
+from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME
+
+@pytest.mark.parametrize(
+    "url, base_url, expected",
+    [
+        ("/test", "http://example.com", "http://example.com/test"),
+        ("http://example.com/test", "http://example.com", "http://example.com/test"),
+        ("path1/path2", "http://example.com", "http://example.com/path1/path2"),
+        ("/path1/path2", "http://example.com", "http://example.com/path1/path2"),
+        ("path.com", "http://example.com", f"{DEFAULT_SCHEME}path.com"),
+    ]
+)
+def test_format_url(url: str, base_url: str, expected: str) -> None:
+    assert format_url(url, base_url) == expected
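With the table-driven form, pytest runs each tuple as its own test case, so one failing input no longer hides the cases after it. If individual cases should be easier to spot in reports, pytest.param can attach explicit ids; the sketch below is an optional variation, not part of this change, and only reuses inputs already covered above.

import pytest

from tiny_web_crawler.networking.formatter import format_url, DEFAULT_SCHEME

# Each pytest.param becomes its own test, and its id shows up in -v output and failure reports.
@pytest.mark.parametrize(
    "url, base_url, expected",
    [
        pytest.param("/test", "http://example.com", "http://example.com/test", id="relative-path"),
        pytest.param("path.com", "http://example.com", DEFAULT_SCHEME + "path.com", id="bare-domain"),
    ],
)
def test_format_url_named_cases(url: str, base_url: str, expected: str) -> None:
    assert format_url(url, base_url) == expected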
New tests for tiny_web_crawler.networking.robots_txt

@@ -0,0 +1,62 @@
from unittest.mock import patch, MagicMock
from io import BytesIO
import urllib.robotparser

import pytest

from tiny_web_crawler.networking.robots_txt import get_robots_txt_url, is_robots_txt_allowed, setup_robots_txt_parser


@pytest.mark.parametrize(
    "url, expected",
    [
        ("http://example", "http://example/robots.txt"),
        ("http://example/path", "http://example/robots.txt"),
        ("https://example/", "https://example/robots.txt"),
        ("http://example/path1/path2/path3/path4", "http://example/robots.txt"),
        ("http://example/path#fragment", "http://example/robots.txt"),
        ("http://example/path?query=test", "http://example/robots.txt"),
    ]
)
def test_get_robots_txt_url(url: str, expected: str) -> None:
    assert get_robots_txt_url(url) == expected


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_true(mock_urlopen: MagicMock) -> None:
    # Mock the response content of robots.txt
    mock_response = b"User-agent: *\nAllow: /"
    mock_urlopen.return_value = BytesIO(mock_response)

    assert is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_false(mock_urlopen: MagicMock) -> None:
    # Mock the response content of robots.txt
    mock_response = b"User-agent: *\nDisallow: /"
    mock_urlopen.return_value = BytesIO(mock_response)

    assert not is_robots_txt_allowed("http://example.com")


@patch('urllib.request.urlopen')
def test_is_robots_txt_allowed_mixed(mock_urlopen: MagicMock) -> None:
    # Mock the response content of robots.txt
    mock_response = b"User-agent: *\nDisallow: /private"

    mock_urlopen.return_value = BytesIO(mock_response)
    assert is_robots_txt_allowed("http://example.com")

    mock_urlopen.return_value = BytesIO(mock_response)
    assert not is_robots_txt_allowed("http://example.com/private")


def test_is_robots_txt_allowed_no_robots_txt() -> None:
    # Check that websites with no robots.txt are set as crawlable
    assert is_robots_txt_allowed("http://example.com")


def test_setup_robots_txt_parser() -> None:
    robot_parser = setup_robots_txt_parser("http://example.com")

    assert isinstance(robot_parser, urllib.robotparser.RobotFileParser)
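These tests patch urllib.request.urlopen because RobotFileParser.read() downloads the rules through it, so swapping in a BytesIO keeps the suite offline and deterministic. Below is a minimal sketch of the same pattern outside pytest, reusing one pre-built parser for several checks; the rule text and URLs are illustrative only.

from io import BytesIO
from unittest.mock import patch

from tiny_web_crawler.networking.robots_txt import is_robots_txt_allowed, setup_robots_txt_parser

# Serve a canned robots.txt instead of hitting the network while the parser reads it.
rules = b"User-agent: *\nDisallow: /private\n"
with patch("urllib.request.urlopen", return_value=BytesIO(rules)):
    parser = setup_robots_txt_parser("http://example.com/robots.txt")

# The parser already holds the parsed rules, so later checks need no network access at all.
assert is_robots_txt_allowed("http://example.com/", parser)
assert not is_robots_txt_allowed("http://example.com/private/data", parser)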