crawl-sync.py
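
"""Synchronous web crawler.

Starting from a single URL, fetches pages over plain HTTP and follows links
found in HTML tags (<a>, <link>, <script>, <img>, <video>, <audio>) as well as
URLs embedded in fetched .js and .css files, restricted to a set of allowed
domains.
"""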
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import argparse
import logging

# Set up logging
logging.basicConfig(format='%(process)d:%(levelname)s:%(module)s:%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser(description="Crawl a website")
parser.add_argument("url", help="The URL of the website to crawl")
parser.add_argument("--domains", nargs="+", help="List of allowed domains")
visited_urls = set()
allowed_domains = set()


def fetch_page(url):
    try:
        # Change scheme to http
        url = re.sub(r'^https', 'http', url)
        logger.info(f"Fetching {url}")
        # Timeout keeps one unresponsive host from stalling the whole crawl
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"Failed to fetch {url}: {e}")
        return None


def extract_links_from_html(html, base_url):
    try:
        soup = BeautifulSoup(html, 'html.parser')
    except Exception as e:
        logger.error(f"Failed to parse {base_url}: {e}")
        return set()
    links = set()
    # Extract links from <a> tags
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        full_url = urljoin(base_url, href)
        if urlparse(full_url).netloc in allowed_domains:
            links.add(full_url)
    # Extract links from <link> tags (CSS files)
    for link_tag in soup.find_all('link', href=True):
        href = link_tag['href']
        full_url = urljoin(base_url, href)
        if urlparse(full_url).netloc in allowed_domains:
            links.add(full_url)
    # Extract links from <script> tags (JS files)
    for script_tag in soup.find_all('script', src=True):
        src = script_tag['src']
        full_url = urljoin(base_url, src)
        if urlparse(full_url).netloc in allowed_domains:
            links.add(full_url)
    # Extract links from <img> tags (images)
    for img_tag in soup.find_all('img', src=True):
        src = img_tag['src']
        full_url = urljoin(base_url, src)
        if urlparse(full_url).netloc in allowed_domains:
            links.add(full_url)
    # Extract links from <video> tags (video files)
    for video_tag in soup.find_all('video', src=True):
        src = video_tag['src']
        full_url = urljoin(base_url, src)
        if urlparse(full_url).netloc in allowed_domains:
            links.add(full_url)
    # Extract links from <audio> tags (audio files)
    for audio_tag in soup.find_all('audio', src=True):
        src = audio_tag['src']
        full_url = urljoin(base_url, src)
        if urlparse(full_url).netloc in allowed_domains:
            links.add(full_url)
    logger.info(f"Found {len(links)} links on {base_url}")
    return links


def extract_links_from_text(text, base_url):
    links = set()
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    matches = re.findall(regex, text)
    for match in matches:
        full_url = urljoin(base_url, match)
        if urlparse(full_url).netloc == urlparse(base_url).netloc:
            links.add(full_url)
    logger.info(f"Found {len(links)} links on {base_url}")
    return links


def crawl(start_url):
    # The queue is a set, so duplicate URLs collapse and pop order is arbitrary
    queue = {start_url}
    while queue:
        logger.info(f"{start_url} - Queue: {len(queue)}, Visited: {len(visited_urls)}")
        url = queue.pop()
        if url in visited_urls:
            continue
        text = fetch_page(url)
        if text:
            visited_urls.add(url)
            # time.sleep(0.05)
            if url.endswith('.js') or url.endswith('.css'):
                links = extract_links_from_text(text, url)
            else:
                links = extract_links_from_html(text, url)
            queue.update(links)


if __name__ == "__main__":
    args = parser.parse_args()
    start_url = args.url
    if args.domains:
        allowed_domains.update(args.domains)
    logger.info(f"Start crawling at {start_url}")
    logger.info(f"Allowed domains: {allowed_domains}")
    crawl(start_url)
    logger.info(f"Crawling complete with {len(visited_urls)} visited URLs")