-
Notifications
You must be signed in to change notification settings - Fork 0
/
textScrapper.py
66 lines (58 loc) · 1.67 KB
/
textScrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# coding: utf8
import requests
from bs4 import BeautifulSoup
# Output file that receives the scraped paragraphs; it is closed at the end
# of the script. UTF-8 is requested explicitly so that non-ASCII page text
# does not depend on the platform's default locale encoding.
o = open('test9.txt', 'w', encoding='utf-8')
# Every link discovered so far, in discovery order (may contain duplicates
# from the homepage pass).
all_links = []
# Root URL of the site being crawled; also used as the prefix for relative
# links.
target = 'http://tvoyslot.com'
def homepage():
    """Seed ``all_links`` with the same-site links found on the ``target`` page.

    Downloads ``target``, walks every ``<a>`` element and records its href
    when the href is longer than one character and is either relative (does
    not start with ``'http'``) or starts with ``target``. Each recorded href
    is also printed.
    """
    r = requests.get(target)
    soup = BeautifulSoup(r.content, 'lxml')
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href attribute yield None; slicing None would
        # raise TypeError and abort the whole crawl, so skip them. Hrefs of
        # length <= 1 (e.g. '#' or '/') are ignored as before.
        if href is None or len(href) <= 1:
            continue
        # Keep relative links and absolute links that point into the target
        # site; drop absolute links to other hosts.
        if not href.startswith('http') or href.startswith(target):
            print(href)
            all_links.append(href)
def linker():
    """Visit every collected link and append newly discovered same-site links.

    The loop iterates over the module-level ``all_links`` list while
    appending to it, so links found on one page are visited later in the
    same loop. The loop ends once no visited page yields an unseen link.
    Each newly recorded href is printed.
    """
    # Set mirror of all_links so the "already known?" test is O(1) instead
    # of a list scan for every anchor on every page.
    seen = set(all_links)
    for i in all_links:
        # 'javascript:...' pseudo-links cannot be fetched. Skipping them
        # here avoids using an unassigned (or stale) response object.
        if i.startswith('java'):
            continue
        if i.startswith('http'):
            r = requests.get(i)
        else:
            # Join with exactly one slash so both 'page' and '/page' resolve
            # under the target host.
            r = requests.get(target.rstrip('/') + '/' + i.lstrip('/'))
        soup = BeautifulSoup(r.content, 'lxml')
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href give None; very short hrefs such as
            # '#' or '/' are ignored.
            if href is None or len(href) <= 1:
                continue
            same_site = not href.startswith('http') or href.startswith(target)
            if same_site and href not in seen:
                print(href)
                seen.add(href)
                all_links.append(href)
def pscraper():
    """Write the longer paragraphs of every collected page to the file ``o``.

    Each distinct entry of ``all_links`` is downloaded once. The text of
    every ``<p>`` element that is at least 40 characters long is written to
    ``o``, followed by a newline.
    """
    # dict.fromkeys removes duplicates while keeping discovery order, so the
    # output file has the same ordering on every run (a set does not).
    for link in dict.fromkeys(all_links):
        # 'javascript:...' pseudo-links are not pages; do not request them.
        if link.startswith('java'):
            continue
        if link.startswith('http'):
            r = requests.get(link)
        else:
            # Join with exactly one slash so '/page' does not become '//page'.
            r = requests.get(target.rstrip('/') + '/' + link.lstrip('/'))
        soup = BeautifulSoup(r.content, 'lxml')
        for text in soup.find_all('p'):
            paragraph = text.text
            if len(paragraph) < 40:
                continue
            try:
                o.write(paragraph + '\n')
            except UnicodeEncodeError:
                # Best effort: skip a paragraph the output file's encoding
                # cannot represent rather than aborting the scrape.
                continue
# Script driver: seed the link list from the homepage when nothing has been
# collected yet, follow the links, then scrape paragraph text from every
# page. The output file is closed even if one of the steps raises.
try:
    if not all_links:
        homepage()
    linker()
    pscraper()
finally:
    o.close()