-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrawl.py
More file actions
100 lines (75 loc) · 3.15 KB
/
crawl.py
File metadata and controls
100 lines (75 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import requests
import time
import subprocess
import os
from bs4 import BeautifulSoup
# Accumulators shared by the option-2 scraper in the main loop below.
# They are module-level, so they persist (and grow) across repeated runs.
pages=[]
titles=[]
prices=[]
rating=[]
# NOTE(review): 'entry' bundles the result lists but is never read anywhere
# visible in this file -- presumably intended for later export; confirm.
entry= {'titles' : titles, 'prices' : prices, 'ratings' : rating, 'pages' : pages}
# Number of books.toscrape.com catalogue pages to fetch for option 2.
count_page=1
# Initial menu selection; the main loop re-prompts when the user continues.
option=input('''Enter the options\n 1. Enter the name of a website to extract all links\n 2. Extracting data like price, Titles, Rating "from books.toscrape.com"\n 3. List the subdomains of a website\n 4. List all directories of a website\n''')
def subdomains():
    """Enumerate subdomains of a user-supplied domain via sublist3r.

    Prompts for a domain, runs the local ``sublist3r.py`` script to write
    its findings to ``list.txt``, then prints that file's contents.
    Side effects only (network + disk); returns None.
    """
    url= input('enter the website name-->')
    # Pass argv as a list with shell=False: the original format-built string
    # only resolved correctly on Windows and allowed shell injection through
    # the user-supplied domain name.
    subprocess.call(['python', 'sublist3r.py', '-d', url, '-o', 'list.txt'])
    # The original os.system("list.txt") relied on a Windows file association
    # to display the results; print the file portably instead.
    try:
        with open('list.txt', 'r') as results:
            print(results.read())
    except FileNotFoundError:
        print('list.txt not found - sublist3r may have failed')
# Main menu loop: dispatch on the selected option, then ask whether to
# continue.  Uses the module-level accumulators and `option` defined above.
while True:
    if option=='3':
        # Option 3: delegate to the sublist3r wrapper defined above.
        subdomains()
    elif option=='4':
        # Option 4: brute-force directory discovery against the given host
        # using the candidate names in the local "directories.txt" wordlist.
        url=input("enter the website name-->")
        with open("directories.txt", 'r') as file:
            print("****list of Directories***\n")
            for line in file:
                candidate = line.strip()
                test_url = url + '/' + candidate
                try:
                    response = requests.get("http://"+ test_url)
                except requests.RequestException:
                    # Skip unreachable candidates rather than crashing the
                    # whole menu loop (the original had no error handling).
                    continue
                if response:  # Response truthiness == HTTP status < 400
                    print(test_url)
    elif option=='2':
        # Option 2: scrape titles, prices, and star ratings from
        # books.toscrape.com.  Reset the module-level accumulators first so
        # a repeated run does not append duplicate rows (original bug).
        del pages[:], titles[:], prices[:], rating[:]
        # Build the catalogue page URLs to fetch.
        for i in range(1, count_page+1):
            pages.append('http://books.toscrape.com/catalogue/page-'+ str(i) +'.html')
        for item in pages:
            page = requests.get(item)
            soup = BeautifulSoup(page.text, 'html.parser')
            for tag in soup.find_all('h3'):
                titles.append(tag.getText())
            for tag in soup.find_all('p' , class_='price_color'):
                prices.append(tag.getText())
            for tag in soup.find_all('p' , class_='star-rating'):
                # The class attribute is a list like ['star-rating', 'Three'];
                # the second token is the rating word.  The original iterated
                # over ALL attributes and would break if the tag ever carried
                # an extra attribute.
                rating.append(tag['class'][1])
        print('**********Title of the books********\n')
        print(titles)
        print('**********Prices of the books********\n')
        print(prices)
        print('**********Rating of the books********\n')
        print(rating)
    elif option=='1':
        # Option 1: list every anchor href on the given site's front page.
        url = input('enter the website name like example.com -->')
        page = requests.get('http://'+ url)
        # Name the parser explicitly: the original BeautifulSoup(data) call
        # used whichever parser bs4 auto-detects, which varies with the
        # libraries installed (and emits a warning).
        soup = BeautifulSoup(page.text, 'html.parser')
        for link in soup.find_all('a'):
            time.sleep(1)  # be polite: one request's links printed slowly
            print(link.get('href'))
    value=input('Do you want to continue y/n--->')
    if value=='y':
        option=input('''Enter the options\n 1. Enter the name of a website to extract all links\n 2. Extracting data like price, Titles, Rating "from books.toscrape.com"\n 3. List the subdomains of a website\n 4. List all directories of a website\n''')
    else:
        break