-
Notifications
You must be signed in to change notification settings - Fork 4
/
douban_online.py
86 lines (79 loc) · 2.93 KB
/
douban_online.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#-*-coding:utf-8-*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import urllib2
import re
from bs4 import BeautifulSoup
import time
import math
def delpunc(s):
    """Return *s* with punctuation characters stripped out.

    The text is cleaned in two passes: every character found in the ASCII
    punctuation list is dropped first, then the characters of the regex
    class below are removed from what is left.
    """
    ascii_punct = ',/\\();:"\'._~!#$%^&*{}[]?><'
    kept = [ch for ch in s if ch not in ascii_punct]
    stripped = ''.join(kept)
    # The ASCII members of this class are already gone after the first pass.
    return re.sub(u"[,。!?【】()]", "", stripped)
# print delpunc(a)
# ---------------------------------------------------------------------------
# Crawl the Douban "online" event listings tag by tag and dump them to 1.txt.
#
# For every tag in the tag index the script writes the tag name on its own
# line, then one tab-separated record per event listed under that tag:
#     <event id> \t <title> \t <account id> \t <participant id> \t ...
# followed by two newlines.  <account id> is the people/site id linked from
# the <h3> of the event page's "info" element (presumably the organiser --
# TODO confirm).
# ---------------------------------------------------------------------------


def _fetch_soup(url):
    """Download *url* and return the page parsed by BeautifulSoup."""
    return BeautifulSoup(urllib2.urlopen(url, timeout=1000).read())


def _first_match(pattern, text):
    """Return the first capture of *pattern* in *text*, or None if absent."""
    hits = re.findall(pattern, text)
    return hits[0] if hits else None


def _write_event(out, event_id, title):
    """Append the id, title, linked account and participants of one event.

    out      -- writable file object receiving the record
    event_id -- numeric id string taken from the event URL
    title    -- unicode event title, already stripped of punctuation

    Two pages are downloaded: the event page and its 'participant' page.
    Fields that cannot be found are written empty (account) or left out
    (participants) instead of aborting the crawl.
    """
    out.write(event_id + "\t")
    # Encode explicitly instead of relying on the process default encoding.
    out.write(title.encode('utf-8') + "\t")

    eachhtml = 'http://www.douban.com/online/' + event_id + '/'
    info = _fetch_soup(eachhtml).find(class_='info')
    org = info.find('h3') if info is not None else None
    org_html = str(org) if org is not None else ''
    # The link points either at a personal profile or at a "site" page.
    uid = _first_match(
        r'<a href="http://www.douban.com/people/(.*?)/">', org_html)
    if uid is None:
        uid = _first_match(
            r'<a href="http://site.douban.com/(.*?)/">', org_html)
    out.write((uid or '') + '\t')

    phtml = eachhtml + 'participant'
    for ptag in _fetch_soup(phtml).findAll('dd'):
        pp = ptag.findAll('a')
        puid = _first_match(
            r'<a href="http://www.douban.com/people/(.*?)/', str(pp))
        if puid is not None:
            out.write(puid + '\t')


a = 0  # running number of events processed; only used for progress output

# ``with`` guarantees the output file is closed even if a request fails.
with open('1.txt', 'w') as out:
    # The stride (160) equals the upper bound, so the tag index is fetched
    # once, at start=0.  Reassigning the loop variable inside the body has no
    # effect on a ``range`` loop, so the stride must be given to ``range``.
    for i in range(0, 160, 160):
        html = 'http://www.douban.com/online/tag/?start=%d' % i
        print(html)
        foundtags = _fetch_soup(html).find(class_="tag-table").findAll('td')
        for tagtag in foundtags:
            # Skip table cells that do not carry both a tag link and a count.
            if tagtag.a is None or tagtag.span is None:
                continue
            tag = tagtag.a.string
            if tag is None:
                continue
            out.write(tag.encode('utf-8') + "\n")

            digits = _first_match(r'\d+', tagtag.span.string or '')
            if digits is None:
                continue
            num = int(digits)
            lim = num // 10 + 1
            tag_url = ('http://www.douban.com/online/tag/' + tag).encode('utf-8')
            # NOTE(review): assuming 10 events per page (the ``* 10`` offset),
            # ``lim + 1`` requests one or two pages past the last one --
            # kept as-is, confirm whether that is intended.
            for j in range(0, lim + 1):
                # Format only the suffix so a '%' inside the tag is harmless.
                url = tag_url + '?start=%d' % (j * 10)
                print('%s %d' % (url, j + 1))  # progress: 1-based page number
                for heading in _fetch_soup(url).findAll("h3"):
                    links = heading.findAll('a')
                    if not links or links[0].string is None:
                        continue
                    anchor = links[0]
                    event_id = _first_match(
                        r'<a href="http://www.douban.com/online/(\d+)/">',
                        str(anchor))
                    if event_id is None:
                        # An <h3> whose link is not an event link.
                        continue
                    a += 1
                    title = delpunc(anchor.string)
                    print(u'%d %s' % (a, title))
                    _write_event(out, event_id, title)
                    # NOTE(review): per-event throttle/record terminator and
                    # per-page throttle placement assumed -- confirm.
                    time.sleep(0.5)
                    out.write('\n' + '\n')
                time.sleep(2)