-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalcIDF.py
95 lines (72 loc) · 1.86 KB
/
calcIDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
# Use othersites.txt as the text repository.
import collections
import math
import MySQLdb
def read(filename, min_length=100):
    """Load comma-separated term lists from a text file, one list per line.

    Every line is stripped of surrounding whitespace, and everything up to
    and including the first ':' is discarded (a line without ':' is kept
    whole).  Lines whose remaining text is not strictly longer than
    *min_length* characters are skipped.

    Args:
        filename: Path of the text file to read.
        min_length: Length threshold; only lines whose remaining text is
            longer than this many characters are kept.  Defaults to 100.

    Returns:
        A list with one entry per kept line; each entry is the list of
        strings obtained by splitting the remaining text on ','.
    """
    arr = []
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as of:
        for line in of:
            line = line.strip()
            # find() returns -1 when there is no ':', so the slice then
            # starts at 0 and keeps the whole line.
            line = line[line.find(':') + 1:]
            if len(line) > min_length:
                arr.append(line.split(','))
    return arr
# Load the pages; each page is a list of comma-separated terms.
arr = read('othersites.txt')

# Vocabulary: every distinct term seen in any page.  update() grows the set
# in place instead of rebuilding it for each page.
wset = set()
for page in arr:
    wset.update(page)

# Document frequency: the number of pages that contain each term.
dic = dict.fromkeys(wset, 0)
for page in arr:
    # De-duplicate so a term counts once per page, and visit only the terms
    # of this page rather than scanning the whole vocabulary.
    for word in set(page):
        dic[word] += 1
        print("%s  :  %s" % (word, dic[word]))

# Turn document frequencies into IDF weights: log(N / (df + 1)).
# The page count is converted to float so the ratio is not truncated by
# Python 2 integer division; a truncated ratio of 0 (term present in every
# page) would make math.log raise ValueError.
total_pages = float(len(arr))
dic = dict((word, math.log(total_pages / (count + 1)))
           for word, count in dic.items())
# IDF-dic to Database
def idf2DB(dic,tablename):
    """Replace the contents of a MySQL table with the given IDF weights.

    The entries of *dic* are ordered by descending value and written as
    rows ``(rank, key, value, "othersites")`` with rank starting at 1.
    The existing rows are deleted and the new ones inserted in a single
    transaction, so a failed insert does not leave the table emptied.

    Args:
        dic: Mapping of term -> numeric weight.
        tablename: Name of the target table.  It is concatenated into the
            SQL text (identifiers cannot be bound as parameters), so it
            must come from a trusted source.

    Returns:
        None.  MySQLdb errors are reported on stdout, not propagated.
    """
    print("tablename :  %s" % tablename)
    # Highest weight first; rank is the 1-based position in that ordering.
    ranked = sorted(dic.items(), key=lambda t: -t[1])
    insertvalues = [[rank, key, value, "othersites"]
                    for rank, (key, value) in enumerate(ranked, 1)]
    conn = None
    try:
        # NOTE(review): connection settings and credentials are hard-coded.
        conn = MySQLdb.connect(host="localhost",user="root",passwd="ene",db="postana",use_unicode=True,charset="utf8",port=0)
        print(" database connected success !")
        cur = conn.cursor()
        try:
            # Drop the rows stored by a previous run, then write the new
            # ones; committing once keeps the replacement all-or-nothing.
            cur.execute("delete from "+tablename)
            cur.executemany("insert into "+tablename+" values(%s,%s,%s,%s)",insertvalues)
            conn.commit()
        finally:
            cur.close()
        print("---Data have been written into  %s  ! ---" % tablename)
    except MySQLdb.Error as e:
        # Driver errors usually carry (errno, message), but not always;
        # avoid raising IndexError from inside the handler.
        if len(e.args) >= 2:
            print("MySQL Error %d: %s" % (e.args[0], e.args[1]))
        else:
            print("MySQL Error: %s" % (e,))
    finally:
        # Always release the connection; closing without a commit discards
        # any uncommitted work.
        if conn is not None:
            conn.close()
# Store the IDF weights held in `dic` in the MySQL table "idftable".
idf2DB(dic,"idftable")
# NOTE(review): the string literal below is disabled code that would write
# the entries of `arr` to out.txt; as a bare string it is never executed.
'''
fo = open('out.txt','w')
for arri in arr:
fo.write(arri+'\n')
fo.close()
'''