#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
import sys
import csv
from os.path import exists, isdir, dirname
from os import makedirs
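# Requires the third-party packages requests and beautifulsoup4
# (e.g. pip install requests beautifulsoup4).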
man_string = """
███████╗ █████╗ ███████╗████████╗███████╗ ██████╗██████╗ █████╗ ██████╗ ███████╗
██╔════╝██╔══██╗██╔════╝╚══██╔══╝██╔════╝██╔════╝██╔══██╗██╔══██╗██╔══██╗██╔════╝
█████╗ ███████║███████╗ ██║ ███████╗██║ ██████╔╝███████║██████╔╝█████╗
██╔══╝ ██╔══██║╚════██║ ██║ ╚════██║██║ ██╔══██╗██╔══██║██╔═══╝ ██╔══╝
██║ ██║ ██║███████║ ██║ ███████║╚██████╗██║ ██║██║ ██║██║ ███████╗
╚═╝ ╚═╝ ╚═╝╚══════╝ ╚═╝ ╚══════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚══════╝
V 0.7
Made by Strykez
options:
-h, --help show this help message and exit
-m, -man, --manual show this manual
-u, --url sets the URL the script scrapes
-s, --selector the selector string used by the script
    selector format --> Column_name:selector.class/another_selector.another_class
    If the column name is empty, the data is appended to the current column;
    otherwise a new column is created and the data is appended to it
    Examples: Titles:div.card/div.first_half/p.title --> gets every instance of p.title under the specified path
              Titles:p.title --> gets every instance of p.title in the page
              p.title --> use this form if you do not want a column name
-o, --output the path where the results are saved in .csv format (creates the directory/ies if necessary)
    if left blank, the selected elements are printed to the terminal
-v, --verbose displays more information about the steps performed by the script
"""
def arg_parser():
    if len(sys.argv) == 1:
        raise Exception('No options in the command line')
    global URL, selector, output_path, verbose_bool
    # Walk the arguments one by one so flags that take no value (-v) can appear anywhere
    index = 1
    while index < len(sys.argv):
        lower_arg = sys.argv[index].lower()
        if lower_arg in ('-m', '-man', '--manual', '-h', '--help'):
            print(man_string)
            exit()
        elif lower_arg in ('-u', '--url'):
            index += 1
            URL = sys.argv[index]
        elif lower_arg in ('-s', '--selector'):
            index += 1
            selector = sys.argv[index]
        elif lower_arg in ('-o', '--output'):
            index += 1
            output_path = sys.argv[index]
        elif lower_arg in ('-v', '--verbose'):
            verbose_bool = True
        else:
            print(f'Argument {sys.argv[index]} not in options')
        index += 1
    if URL == '':
        raise Exception('URL not specified')
class ScrapeBot:
    def __init__(self, URL, output, selector):
        self.URL = URL
        self.output = output
        self.selector = selector
    def process_selectors(self, selector_string):
        # Split off an optional 'Column_name:' prefix, then parse each 'tag.class' step
        global column_name, selector_array, class_array
        parts = selector_string.split(':', 1)
        if len(parts) == 2:
            column_name = parts[0]
        for element in parts[-1].split('/'):
            element = element.split('.')
            selector_array.append(element[0])
            try:
                class_array.append(element[1])
            except IndexError:
                # No class given for this step
                class_array.append('NONE')
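    # For example, 'Titles:div.card/p.title' yields column_name == 'Titles',
    # selector_array == ['div', 'p'] and class_array == ['card', 'title'];
    # a bare step such as 'p' gets the class placeholder 'NONE'.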
    def find_all_by_index(self, bs, index):
        # Find every tag matching the selector at this depth, filtered by class when one was given
        global selector_array, class_array
        if class_array[index] == 'NONE':
            return bs.find_all(selector_array[index])
        return bs.find_all(selector_array[index], class_=class_array[index])
    def recursive_search(self, source, level):
        global parser, final_element_array
        bs = BeautifulSoup(str(source), parser)
        # HTML snippets that match the selector (and its class) at this depth
        next_source = self.find_all_by_index(bs, level)
        for element in next_source:
            if level == len(selector_array) - 1:
                # Last selector in the chain: collect the element's text
                final_element_array.append(element.text)
            else:
                self.recursive_search(element, level + 1)
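    # Illustrative walk for selector_array == ['div', 'p']: level 0 collects every
    # matching <div>, and level 1 extracts the text of each matching <p> inside it.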
    def print_final_elements(self, final_element_array, column_name):
        if column_name != '':
            print(column_name)
        for element in final_element_array:
            print(element)
    def save_to_csv(self, final_element_array, column_name, output_path):
        # Create the parent directory if needed; dirname handles paths reliably,
        # unlike stripping the file name's characters from the string
        directory = dirname(output_path)
        if directory and not isdir(directory):
            makedirs(directory)
        if exists(output_path):
            if column_name == '':
                # No column name: append the elements as new rows
                with open(output_path, 'a', newline='') as csv_output:
                    writer = csv.writer(csv_output)
                    for element in final_element_array:
                        writer.writerow([element])
            else:
                # Column name given: append the elements as a new column
                final_data = []
                final_element_array.insert(0, column_name)
                with open(output_path, 'r') as csv_input:
                    reader = csv.reader(csv_input)
                    for index, row in enumerate(reader):
                        try:
                            final_data.append(row + [final_element_array[index]])
                        except IndexError:
                            # More existing rows than new elements: keep the row unchanged
                            final_data.append(row)
                with open(output_path, 'w', newline='') as csv_output:
                    writer = csv.writer(csv_output)
                    writer.writerows(final_data)
        else:
            with open(output_path, 'w', newline='') as csv_output:
                writer = csv.writer(csv_output)
                if column_name != '':
                    writer.writerow([column_name])
                for element in final_element_array:
                    writer.writerow([element])
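    # Illustrative result: saving ['A', 'B'] with column_name 'Titles' to a fresh file
    # writes the rows ['Titles'], ['A'], ['B']; running again with a different column
    # name appends that column alongside the existing rows.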
if __name__ == '__main__':
    URL, selector, output_path, column_name = '', '', '', ''
    verbose_bool = False
    selector_array, class_array, final_element_array = [], [], []
    parser = 'html.parser'
    arg_parser()
    if selector == '':
        raise Exception('No selector option in the command line')
    if verbose_bool:
        print(f'✔️ Initializing script with the following arguments: URL: {URL}, selector: {selector}, output path: {output_path}')
    page_request = requests.get(URL)
    if verbose_bool:
        print(f'✔️ Status code of the request is: {page_request.status_code}')
    source = page_request.text
    if verbose_bool:
        print('✔️ Initializing scraper...')
    scraper = ScrapeBot(URL, output_path, selector)
    scraper.process_selectors(selector)
    if verbose_bool:
        print('✔️ Searching the selectors...')
    scraper.recursive_search(source, 0)
    if output_path == '':
        if verbose_bool:
            print('✔️ Outputting to screen...')
        scraper.print_final_elements(final_element_array, column_name)
    else:
        if verbose_bool:
            print(f'✔️ Saving to csv file in path: {output_path}')
        scraper.save_to_csv(final_element_array, column_name, output_path)