-
Notifications
You must be signed in to change notification settings - Fork 0
/
khan.py
134 lines (121 loc) · 3.95 KB
/
khan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#-*- coding:utf-8 -*-
import sys
import urllib
import urllib2
import re
import tool
import os
# Spider class that scrapes the Khan open-course pages on open.163.com
class Spider:
    """Scrape the Khan open-course index at open.163.com and save course info.

    The index page is searched for course links; every linked detail page is
    downloaded and its name, brief and lesson list are extracted with regular
    expressions and written to an output file.

    Extracted fragments are passed through ``tool.Tool().replace``
    (presumably an HTML clean-up helper -- confirm in tool.py).
    """

    # Patterns are compiled once at class creation instead of on every call.
    # All use re.S so that '.' also matches newlines inside the markup.
    _COURSE_LINK_RE = re.compile(
        '<div class="g-cell1 g-card1">.*?<a href="(.*?)" .*?', re.S)
    _COURSE_NAME_RE = re.compile(
        '<div class="m-cdes">.*?<h2>(.*?)</h2>', re.S)
    # NOTE(review): the previous pattern ('<div class="m-cteacher">.*? ') had
    # no capture group, so group(1) raised IndexError on every match.
    # Capturing the div body up to the first closing </div> is an assumption
    # about the page markup -- TODO confirm against a live detail page.
    _TEACHER_RE = re.compile(
        '<div class="m-cteacher">(.*?)</div>', re.S)
    _BRIEF_RE = re.compile(
        '<div class="m-cdes">.*?</h2>(.*?)<b>', re.S)
    _COURSE_LIST_RE = re.compile(
        ' <td class="u-ctitle">.*?(.*?)<a href=.*?>(.*?)</a>', re.S)
    _TIE_RE = re.compile(
        '<div class="tie-titlebar">.*?<a href.*?>(.*?)</a>', re.S)

    def __init__(self):
        """Set the index URL and create the text clean-up helper."""
        self.siteURL = 'http://open.163.com/khan/'
        self.tool = tool.Tool()

    def _first_group(self, pattern, page):
        """Return the cleaned first capture group of *pattern*, or None."""
        match = pattern.search(page)
        if match is None:
            return None
        return self.tool.replace(match.group(1))

    def getPage(self):
        """Download the index page.

        Returns the raw response body, or None when the request fails with
        ``urllib2.URLError`` (the failure reason is printed).
        """
        try:
            response = urllib2.urlopen(urllib2.Request(self.siteURL))
            try:
                return response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
        except urllib2.URLError as e:
            print("Fail %s" % (getattr(e, "reason", e),))
            return None

    def getContents(self):
        """Return the list of course detail URLs found on the index page.

        Returns an empty list when the index page could not be downloaded or
        contains no matching links, so callers can always iterate the result.
        """
        page = self.getPage()
        if page is None:
            return []
        links = self._COURSE_LINK_RE.findall(page)
        if not links:
            print("No course links found")
        return list(links)

    def getDetailPage(self, infoURL):
        """Download and return the raw body of the page at *infoURL*.

        Download errors (e.g. ``urllib2.URLError``) propagate to the caller.
        """
        response = urllib2.urlopen(infoURL)
        try:
            return response.read()
        finally:
            response.close()

    def getCourseName(self, page):
        """Return the cleaned course title from a detail page, or None."""
        return self._first_group(self._COURSE_NAME_RE, page)

    def getTeacher(self, page):
        """Return the cleaned teacher section from a detail page, or None."""
        return self._first_group(self._TEACHER_RE, page)

    def getBrief(self, page):
        """Return the cleaned course description from a detail page, or None."""
        return self._first_group(self._BRIEF_RE, page)

    def getCourseList(self, page):
        """Return the lessons on a detail page as ``[prefix, title]`` pairs.

        ``prefix`` is the raw text between the title cell and its link and
        ``title`` is the link text. Returns None when no lesson is found.
        """
        rows = self._COURSE_LIST_RE.findall(page)
        if not rows:
            return None
        return [[prefix, title] for prefix, title in rows]

    def getTieShow(self, page):
        """Return the cleaned text of the first link in the comment title bar.

        Returns None when the title bar is not present.
        """
        return self._first_group(self._TIE_RE, page)

    def saveInfo(self, out=None):
        """Crawl every course and write its details to *out*.

        For each course the name, the brief and one line per lesson are
        written, followed by a blank separator line. A course whose detail
        page cannot be downloaded is reported and skipped.

        out -- object with a ``write`` method; defaults to the module-level
               file ``f`` opened by the script section of this file.
        """
        if out is None:
            # Backward-compatible default: the script at the bottom of the
            # file binds the output file to the global name f.
            out = f
        for detailURL in self.getContents() or []:
            print("find %s" % (detailURL,))
            try:
                detailPage = self.getDetailPage(detailURL)
            except urllib2.URLError as e:
                # One unreachable course must not abort the whole crawl.
                print("Fail %s" % (getattr(e, "reason", e),))
                continue
            name = self.getCourseName(detailPage)
            brief = self.getBrief(detailPage)
            print(brief)
            if name is not None:
                out.write("CourseName:" + name + '\n')
            if brief is not None:
                out.write("CourseBrief:" + brief + '\n')
            lessons = self.getCourseList(detailPage)
            if lessons:
                for lesson in lessons:
                    lesson[0] = self.tool.replace(lesson[0])
                    out.write(lesson[0] + lesson[1] + '\n')
            # Blank line separates one course record from the next.
            out.write('\n')
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# The output file must be bound to the module-level name f because
# Spider.saveInfo writes through that global. The with-block guarantees the
# file is flushed and closed even if the crawl raises.
with open("khan.txt", "w+") as f:
    spider = Spider()
    spider.saveInfo()