scrape all data from any website.py
from bs4 import BeautifulSoup
from urllib.request import urlopen

URL_list = []

def Process(URL):
    """Collect every href found on URL into URL_list and return the count."""
    print("Reading %s..." % URL)
    counter = 0
    html_page = urlopen(URL)
    soup = BeautifulSoup(html_page, "lxml")
    for link in soup.findAll('a'):
        href = link.get('href')
        if isinstance(href, str):
            URL_list.append(href)
            counter += 1
    print("Found %d URLs on this page" % counter)
    return counter

def Scrap_Text(url):
    """Extract the visible text from url, append it to Data.txt and echo it."""
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "lxml")
    # Drop script and style elements so only human-readable text remains.
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    # Strip whitespace, break lines on runs of double spaces, and drop blanks.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    with open("Data.txt", "a") as file:
        file.write(text)
    print(text)

URL = input("input url to scrape eg(https://books.toscrape.com/) >> :")
URL_list.append(URL)
# URL_list grows while we iterate over it, so every absolute link that
# Process() discovers is itself visited on a later pass of this loop.
for item in URL_list:
    if item[0:4] == "www.":
        item = "http://" + item  # urlopen() needs an explicit scheme
    if item[0:4] == "http":
        try:
            Process(item)
            Scrap_Text(item)
        except Exception as err:
            # A dead or malformed link should not kill the whole crawl.
            print("Skipping %s (%s)" % (item, err))
print(URL_list)
print("size of list now is : %d" % len(URL_list))
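Note that the prefix check above skips relative hrefs such as "/catalogue/page-2.html", which are common on the example site. A minimal sketch of one way to handle them, using the standard-library urljoin; resolve_link is a hypothetical helper, not part of the original script:

from urllib.parse import urljoin

def resolve_link(base_url, href):
    """Resolve a possibly relative href against the page it was found on."""
    absolute = urljoin(base_url, href)
    # Keep only web links; this drops mailto:, javascript:, etc.
    if absolute.startswith(("http://", "https://")):
        return absolute
    return None

Inside Process(), one could then append resolve_link(URL, link.get('href')) whenever it is not None, so relative links are crawled instead of silently discarded.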