
Commit 45faf47

feat: better robots.txt parsing by using urllib
close: #36
1 parent: a6d68c8

1 file changed: +4 -1 lines

wikiteam3/utils/wiki_avoid.py

@@ -1,5 +1,6 @@
 import re
 import sys
+import urllib.robotparser
 from urllib.parse import urlparse
 
 import requests
@@ -29,14 +30,16 @@ def avoid_robots_disallow(config: Config, other: OtherConfig):
     """Check if the robots.txt allows the download"""
     url = config.api or config.index
     exit_ = False
+    bot = urllib.robotparser.RobotFileParser()
     try:
         # Don't use the session.get() method here, since we want to avoid the session's retry logic
         r = requests.get(
             urlparse(url).scheme + '://' + urlparse(url).netloc + '/robots.txt',
             cookies=other.session.cookies, headers=other.session.headers, verify=other.session.verify, proxies=other.session.proxies
         )
         if r.status_code == 200:
-            if 'user-agent: wikiteam3\ndisallow: /' in r.text.lower():
+            bot.parse(r.text.splitlines())
+            if not bot.can_fetch('wikiteam3', '/') and 'wikiteam3' in r.text:
                 print('This wiki not allow wikiteam3 to archive.')
                 exit_ = True
     except Exception as e:
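
For context, here is a minimal sketch (not part of the commit) of what the new check does: RobotFileParser.parse() takes the lines of a robots.txt and can_fetch() then answers per-user-agent allow/disallow questions. Unlike the removed exact-substring match, this tolerates comments, blank lines, CRLF line endings, and other directives appearing between the User-agent and Disallow lines. The robots.txt content below is hypothetical.

# Illustrative sketch, not part of the commit: how urllib.robotparser
# answers the question the new code asks, against a hypothetical robots.txt.
import urllib.robotparser

# Hypothetical robots.txt that disallows everything for wikiteam3.
robots_txt = "User-agent: wikiteam3\r\nDisallow: /\r\n"

bot = urllib.robotparser.RobotFileParser()
bot.parse(robots_txt.splitlines())

print(bot.can_fetch('wikiteam3', '/'))  # False: '/' is disallowed for wikiteam3
print(bot.can_fetch('otherbot', '/'))   # True: no rule names otherbot

The extra "'wikiteam3' in r.text" guard in the diff presumably keeps the dump from aborting when a wildcard "User-agent: *" rule blocks all crawlers without naming wikiteam3, since can_fetch('wikiteam3', '/') would also return False in that case.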
