main.py
import os
import sys
import urllib.parse
from urllib.parse import urljoin
import datetime
import argparse
import shutil
import time
import logging

import requests
from bs4 import BeautifulSoup
from colorama import Fore
from prettytable import PrettyTable, MARKDOWN


def get_html(url):
    """Fetch a URL's HTML, exiting with an error message on any request failure."""
    try:
        r = requests.get(url)
        r.raise_for_status()
        return r.text
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        sys.exit(1)


def convert_to_absolute_url(url, base_url):
    """Resolve a possibly relative URL against the page's base URL."""
    return urllib.parse.urljoin(base_url, url)


def get_links(html, base_url):
    """Collect absolute, same-domain links from the page."""
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for link in soup.find_all('a', href=True):
        url = convert_to_absolute_url(link['href'], base_url)
        # Skip links that point to a different host than the base URL.
        if '://' in url and urllib.parse.urlparse(base_url).netloc != urllib.parse.urlparse(url).netloc:
            continue
        links.add(url)
    return links


def get_resources(html, base_url, args):
    """Collect resource URLs (src attributes), optionally filtered by extension."""
    soup = BeautifulSoup(html, 'html.parser')
    resources = []
    for res in soup.find_all(src=True):
        url = urljoin(base_url, res['src'])
        # With no --extensions given, keep every resource; otherwise filter by suffix.
        if not args.extensions or url.split('.')[-1] in args.extensions:
            resources.append(url)
    return resources


def download_resources(folder, resources):
    """Download each resource into `folder` and return a Markdown status table."""
    # Initialize the status table
    table = PrettyTable(["Date", "Status", "Resource"])
    for resource in resources:
        filename = os.path.join(folder, resource.split('/')[-1])
        if os.path.isdir(filename):
            date = datetime.datetime.now()
            table.add_row([date, Fore.BLUE + "Skipped (Directory)" + Fore.RESET, filename])
            continue
        try:
            r = requests.get(resource, stream=True)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            date = datetime.datetime.now()
            table.add_row([date, Fore.GREEN + "Downloaded" + Fore.RESET, filename])
        except requests.exceptions.RequestException as e:
            date = datetime.datetime.now()
            table.add_row([date, Fore.RED + "Error" + Fore.RESET, str(e)])
    # Write the plain-text table to a file
    with open('output.txt', 'w') as f:
        f.write(str(table))
    # Return the same table rendered as Markdown
    table.set_style(MARKDOWN)
    return table.get_string()


def main():
    start_time = time.time()
    parser = argparse.ArgumentParser(description="Download resources from a website.")
    parser.add_argument("url", help="The URL of the website to download resources from.")
    parser.add_argument("folder", help="The folder to download resources to.")
    parser.add_argument("--delete", action="store_true", help="Delete the folder after downloading resources.")
    parser.add_argument("--extensions", nargs='+', default=[], help="The file extensions to download. If not provided, all resources will be downloaded.")
    parser.add_argument("--log", type=str, help="The name of the log file.")
    args = parser.parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.INFO)
    url = args.url
    folder = args.folder
    if not os.path.exists(folder):
        os.makedirs(folder)
    html = get_html(url)
    links = get_links(html, url)
    resources = get_resources(html, url, args)
    markdown_table = download_resources(folder, resources)
    with open('output.md', 'w') as f:
        f.write(markdown_table)
    print(markdown_table)
    if args.log:
        logging.info(markdown_table)
    # If the --delete option was provided, delete the folder and stop after the first page
    if args.delete:
        print(Fore.RED + "Deleting folder..." + Fore.RESET)
        shutil.rmtree(folder)
        print(Fore.GREEN + "Folder deleted successfully" + Fore.RESET)
        if args.log:
            logging.info("Folder deleted successfully")
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(Fore.LIGHTYELLOW_EX + f"System run time: {elapsed_time} seconds" + Fore.RESET)
        if args.log:
            logging.info(f"System run time: {elapsed_time} seconds")
    if not args.delete:
        # Crawl the same-domain links found on the start page and download their resources too
        for link in links:
            html = get_html(link)
            resources = get_resources(html, link, args)
            markdown_table = download_resources(folder, resources)
            print(markdown_table)
            if args.log:
                logging.info(markdown_table)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(Fore.LIGHTYELLOW_EX + f"System run time: {elapsed_time} seconds" + Fore.RESET)
        if args.log:
            logging.info(f"System run time: {elapsed_time} seconds")


if __name__ == '__main__':
    main()
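
# A minimal usage sketch: the command below is an illustration only; the URL,
# folder name, extensions, and log filename are hypothetical placeholders, not
# values taken from this repository.
#
#   python main.py https://example.com downloads --extensions png jpg css js --log run.log
#
# This would download matching resources from https://example.com (and its
# same-domain links) into ./downloads, write the status tables to output.txt
# and output.md, and append them to run.log. Adding --delete removes the folder
# again after the first page's resources are downloaded, skipping the crawl.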