
Commit 1c91498

feat: better robots.txt parsing by using urllib
close: #36
1 parent a6d68c8 commit 1c91498

File tree

2 files changed: +125 -1 lines changed


tests/test_wiki_avoid.py

+121
@@ -0,0 +1,121 @@
import unittest
from unittest.mock import patch, MagicMock
from urllib.parse import urlparse
import requests
from wikiteam3.dumpgenerator.config import Config, OtherConfig
from wikiteam3.utils.wiki_avoid import avoid_robots_disallow

# filepath: wikiteam3/utils/test_wiki_avoid.py


class TestAvoidRobotsDisallow(unittest.TestCase):

    @patch('wikiteam3.utils.wiki_avoid.sys.exit')
    @patch('wikiteam3.utils.wiki_avoid.requests.get')
    @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser')
    def test_avoid_robots_disallow_allowed(self, mock_robotparser, mock_requests_get, mock_sys_exit):
        """Test when robots.txt allows the user agent"""
        config = Config()
        config.api = "http://example.com/w/api.php"
        other = MagicMock()
        other.session = requests.Session()

        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.text = "User-agent: *\nAllow: /"
        mock_requests_get.return_value = mock_response

        mock_bot = MagicMock()
        mock_bot.can_fetch.return_value = True
        mock_robotparser.return_value = mock_bot

        avoid_robots_disallow(config, other)

        mock_requests_get.assert_called_once()
        mock_bot.parse.assert_called_once()
        self.assertEqual(mock_sys_exit.call_count, 0)

    @patch('wikiteam3.utils.wiki_avoid.sys.exit')
    @patch('wikiteam3.utils.wiki_avoid.requests.get')
    @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser')
    def test_avoid_robots_disallow_disallowed(self, mock_robotparser, mock_requests_get, mock_sys_exit):
        """Test when robots.txt disallows the user agent"""
        config = Config()
        config.api = "http://example.com/w/api.php"
        other = MagicMock()
        other.session = requests.Session()

        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.text = "User-agent: wikiteam3\nDisallow: /"
        mock_requests_get.return_value = mock_response

        mock_bot = MagicMock()
        mock_bot.can_fetch.return_value = False
        mock_robotparser.return_value = mock_bot

        avoid_robots_disallow(config, other)

        mock_requests_get.assert_called_once()
        mock_bot.parse.assert_called_once()
        mock_sys_exit.assert_called_once_with(20)

    @patch('wikiteam3.utils.wiki_avoid.sys.exit')
    @patch('wikiteam3.utils.wiki_avoid.requests.get')
    @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser')
    def test_avoid_robots_disallow_error(self, mock_robotparser, mock_requests_get, mock_sys_exit):
        """Test when there is an error fetching robots.txt"""
        config = Config()
        config.api = "http://example.com/w/api.php"
        other = MagicMock()
        other.session = requests.Session()

        mock_requests_get.side_effect = Exception("Test exception")

        avoid_robots_disallow(config, other)

        mock_requests_get.assert_called_once()
        self.assertEqual(mock_robotparser.call_count, 1)
        self.assertEqual(mock_sys_exit.call_count, 0)

    @patch('wikiteam3.utils.wiki_avoid.sys.exit')
    @patch('wikiteam3.utils.wiki_avoid.requests.get')
    @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser')
    def test_avoid_robots_disallow_robots_not_found(self, mock_robotparser, mock_requests_get, mock_sys_exit):
        """Test when robots.txt returns a 404"""
        config = Config()
        config.api = "http://example.com/w/api.php"
        other = MagicMock()
        other.session = requests.Session()

        mock_response = MagicMock()
        mock_response.status_code = 404
        mock_requests_get.return_value = mock_response

        avoid_robots_disallow(config, other)

        mock_requests_get.assert_called_once()
        self.assertEqual(mock_robotparser.call_count, 1)
        self.assertEqual(mock_sys_exit.call_count, 0)

    @patch('wikiteam3.utils.wiki_avoid.sys.exit')
    @patch('wikiteam3.utils.wiki_avoid.requests.get')
    @patch('wikiteam3.utils.wiki_avoid.urllib.robotparser.RobotFileParser')
    def test_avoid_robots_disallow_no_api_index(self, mock_robotparser, mock_requests_get, mock_sys_exit):
        """Test when both config.api and config.index are None"""
        config = Config()
        config.api = None
        config.index = None
        other = MagicMock()
        other.session = requests.Session()

        avoid_robots_disallow(config, other)

        self.assertEqual(mock_requests_get.call_count, 0)
        self.assertEqual(mock_robotparser.call_count, 1)
        self.assertEqual(mock_sys_exit.call_count, 0)


if __name__ == '__main__':
    unittest.main()
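
The mocked can_fetch return values line up with what urllib.robotparser itself produces for the two robots.txt bodies used above. A minimal standalone sketch (not part of the commit) for reference:

import urllib.robotparser

# Parse the "allow everyone" body used in the first test.
bot = urllib.robotparser.RobotFileParser()
bot.parse("User-agent: *\nAllow: /".splitlines())
print(bot.can_fetch('wikiteam3', '/'))   # True: every agent may fetch the site root

# Parse the "disallow wikiteam3" body used in the second test.
bot = urllib.robotparser.RobotFileParser()
bot.parse("User-agent: wikiteam3\nDisallow: /".splitlines())
print(bot.can_fetch('wikiteam3', '/'))   # False: wikiteam3 is blocked from the whole site

The suite can be run from the repository root with, e.g., python -m unittest tests.test_wiki_avoid.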

wikiteam3/utils/wiki_avoid.py

+4 -1
@@ -1,5 +1,6 @@
 import re
 import sys
+import urllib.robotparser
 from urllib.parse import urlparse
 
 import requests
@@ -29,14 +30,16 @@ def avoid_robots_disallow(config: Config, other: OtherConfig):
     """Check if the robots.txt allows the download"""
     url = config.api or config.index
     exit_ = False
+    bot = urllib.robotparser.RobotFileParser()
     try:
         # Don't use the session.get() method here, since we want to avoid the session's retry logic
         r = requests.get(
             urlparse(url).scheme + '://' + urlparse(url).netloc + '/robots.txt',
             cookies=other.session.cookies, headers=other.session.headers, verify=other.session.verify, proxies=other.session.proxies
         )
         if r.status_code == 200:
-            if 'user-agent: wikiteam3\ndisallow: /' in r.text.lower():
+            bot.parse(r.text.splitlines())
+            if not bot.can_fetch('wikiteam3', '/') and 'wikiteam3' in r.text:
                 print('This wiki not allow wikiteam3 to archive.')
                 exit_ = True
     except Exception as e:
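
For context, a minimal sketch of the new robots.txt check in isolation, assuming a hypothetical wiki URL; the real code takes the URL from config.api or config.index and reuses the session's cookies, headers, TLS verification, and proxy settings:

import urllib.robotparser
from urllib.parse import urlparse

import requests

# Hypothetical example URL; wiki_avoid.py derives it from config.api or config.index.
url = "http://example.com/w/api.php"
robots_url = urlparse(url).scheme + '://' + urlparse(url).netloc + '/robots.txt'

r = requests.get(robots_url)
bot = urllib.robotparser.RobotFileParser()
if r.status_code == 200:
    bot.parse(r.text.splitlines())
    # Only treat the wiki as off-limits if robots.txt mentions wikiteam3 explicitly
    # and the parsed rules forbid it from fetching the site root.
    if not bot.can_fetch('wikiteam3', '/') and 'wikiteam3' in r.text:
        print('robots.txt disallows wikiteam3; the dump would abort here.')

Compared with the old substring match, the parser also honors rule ordering and wildcard groups, so a robots.txt that blocks wikiteam3 with extra whitespace or additional directives is still detected.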
