-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathratesAndDB.py
130 lines (101 loc) · 2.97 KB
/
ratesAndDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
import readCSV
import getchlistfromhtml
import collections
import MySQLdb
# The full URL list presumably comes from the browser-history CSV; that call
# is currently disabled in favour of a single hard-coded URL.
#urlArr = readCSV.getUrl("browserHistory.csv")
urlArr = ["http://blog.csdn.net/nevasun/article/details/7331644"]

# One word -> occurrence-count dictionary per site category.
csdn = {}
zhihu = {}
buptbbs = {}
ustcbbs = {}
baiduQA = {}
othersites = {}

# (substring looked for in the URL, dictionary that receives the counts).
# Order matters: the first matching substring wins, exactly like the
# if/elif chain this table replaces.
_SITE_COUNTS = (
    ('csdn', csdn),
    ('zhihu', zhihu),
    ('bupt', buptbbs),
    ('ustc', ustcbbs),
    ('baidu', baiduQA),
)


def _counts_for_url(url):
    """Return the count dictionary for *url*; ``othersites`` if none match."""
    for marker, counts in _SITE_COUNTS:
        if marker in url:
            return counts
    return othersites


# Count every word of every page in the dictionary of the page's site.
for url in urlArr:
    counts = _counts_for_url(url)
    for word in getchlistfromhtml.getChListFromHtml(url):
        # Each dictionary is checked against itself.  The earlier code tested
        # membership in ``csdn`` for the bupt/ustc/baidu/other branches, which
        # either reset the count to 0 on every occurrence or raised KeyError.
        counts[word] = counts.get(word, 0) + 1
# Store the data in the database.
# database: postana
# tables: csdntable, zhihutable, ...
# usage: in2db(csdn, "csdntable")
def in2db(dic, tablename):
    """Replace the contents of *tablename* with the word rates from *dic*.

    The words are ranked by descending count, every count is converted to
    its share of the total count, and one row ``(rank, word, rate, site)``
    is written per word.  ``rank`` starts at 1 and ``site`` is *tablename*
    without its last five characters (e.g. ``"csdntable"`` -> ``"csdn"``).

    Args:
        dic: mapping of word -> occurrence count.
        tablename: name of the target table in the ``postana`` database.
            It is concatenated into the SQL text (identifiers cannot be
            bound as query parameters), so it must come from trusted code.

    Returns:
        None.  ``MySQLdb.Error`` is reported on stdout and not re-raised.
    """
    # Single pre-formatted strings keep the output byte-identical to the old
    # comma-separated print statements while also parsing on Python 3.
    print("tablename :  %s" % (tablename,))

    # Highest count first; sorted() is stable, so words with equal counts
    # keep the order in which dic.items() yields them.
    ranked = sorted(dic.items(), key=lambda pair: -pair[1])
    total = sum(count for _, count in ranked)
    site = tablename[0:-5]
    rows = [
        (rank, word, float(count) / total, site)
        for rank, (word, count) in enumerate(ranked, 1)
    ]

    try:
        conn = MySQLdb.connect(host="localhost", user="root", passwd="ene",
                               db="postana", use_unicode=True,
                               charset="utf8", port=0)
        try:
            print(" database connected success !")
            cur = conn.cursor()
            try:
                # Delete and insert share one commit so that, on a
                # transactional storage engine, a failed insert does not
                # leave the table emptied.
                cur.execute("delete from " + tablename)
                if rows:
                    cur.executemany(
                        "insert into " + tablename + " values(%s,%s,%s,%s)",
                        rows)
                conn.commit()
            finally:
                cur.close()
        finally:
            # Always release the connection, including on errors.
            conn.close()
        print("---Data have been written into  %s  ! ---" % (tablename,))
    except MySQLdb.Error as e:
        print("MySQL Error %d: %s" % (e.args[0], e.args[1]))
# Script entry point: announce the run, then write the CSDN word rates.
if __name__ == "__main__":
    print("use main")
    in2db(dic=csdn, tablename="csdntable")