-
Notifications
You must be signed in to change notification settings - Fork 0
/
gtp.py
51 lines (44 loc) · 1.58 KB
/
gtp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup
import re
import time
# Every URL visited so far, in visit order; used to detect cycles.
# NOTE: the first entry is the URL exactly as the user typed it, while later
# entries are always built as WIKI_ROOT + href, so a differently spelled start
# URL (e.g. https://) is only recognised as a loop on its second visit.
history = []

WIKI_ROOT = 'http://en.wikipedia.org'
TARGET_TITLE = 'Philosophy'
REQUEST_DELAY = 0.5    # seconds to pause after each request
REQUEST_TIMEOUT = 10   # seconds before a stalled request is abandoned

# Site-relative article links only (href starts with /wiki/).
ARTICLE_HREF = re.compile('^/wiki/')
# Links to pages that do not exist yet.
REDLINK_HREF = re.compile('redlink=1$')
# Whitespace followed by a parenthesized group, nested up to three levels.
# Each alternative consumes a single character ([^)(] rather than [^)(]+) so
# an unmatched "(" fails in linear time instead of backtracking exponentially.
PARENTHESIZED = re.compile(r'\s+\((?:[^)(]|\((?:[^)(]|\([^)(]*\))*\))*\)')


def strip_parenthesized(markup):
    """Return *markup* with whitespace-preceded parenthesized text removed.

    A "(" that is not preceded by whitespace (for example inside an href such
    as ``/wiki/Mercury_(planet)``) is left untouched, as is an unbalanced "(".
    """
    return PARENTHESIZED.sub('', markup)


def fetch_page(url):
    """Download *url*, pause briefly, and return the parsed document.

    Errors raised by ``requests.get`` (including a timeout after
    REQUEST_TIMEOUT seconds) are not caught and propagate to the caller.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    time.sleep(REQUEST_DELAY)
    return BeautifulSoup(response.content, features='html.parser')


def first_article_href(soup):
    """Return the href of the first eligible /wiki/ link in *soup*, or None.

    Tables are removed from *soup* (it is modified in place).  Then each
    class-less <p> is stripped of redlinks, italic text and parenthesized
    text before it is searched, so links inside those are never chosen.
    """
    for table in soup.find_all(['table']):
        table.replace_with("")

    for paragraph in soup.find_all('p', {"class": ""}):
        for tag in paragraph.find_all(href=REDLINK_HREF):
            tag.replace_with("")
        for tag in paragraph.find_all(['i']):
            tag.replace_with("")

        # Parentheses are stripped on the serialized markup, so the result
        # has to be parsed again before it can be searched for links.
        cleaned = BeautifulSoup(strip_parenthesized(str(paragraph)),
                                features='html.parser')

        # The same anchored pattern both selects the paragraph and extracts
        # the link, so a paragraph that only contains absolute .../wiki/...
        # URLs is skipped instead of yielding a missing link.
        anchor = cleaned.find(href=ARTICLE_HREF)
        if anchor is not None:
            return anchor.get('href')
    return None


def main():
    """Prompt for a start URL and follow first links until 'Philosophy'.

    Prints each URL that is followed.  Stops when the page heading equals
    TARGET_TITLE, when a page has no heading or no eligible link, or when the
    next URL has already been visited.
    """
    while True:
        link = input("url: ")
        if link != "":
            break
    history.append(link)
    soup = fetch_page(link)

    while True:
        heading = soup.find(id='firstHeading')
        if heading is None:
            # Not an article page (or an unexpected layout): nothing to follow.
            print("no article heading found!")
            break
        if heading.get_text() == TARGET_TITLE:
            break

        href = first_article_href(soup)
        if href is None:
            print("no link exists!")
            break

        link = WIKI_ROOT + href
        if link in history:
            print("loop exists!")
            break

        print(link)
        history.append(link)
        soup = fetch_page(link)


if __name__ == "__main__":
    main()