-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathfinal_test.py
155 lines (129 loc) · 3.54 KB
/
final_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import csv, sys, re
from nltk.corpus import stopwords
from textblob import TextBlob
# Paper bodies are far larger than csv's default 128 KB field limit;
# raise it to the platform maximum so reads below don't raise.
csv.field_size_limit(sys.maxsize)
#open file that contains all papers and labels
def split_into_lemmas(message):
    """Lower-case *message* and return a list of its lemmatized tokens,
    dropping English stop words, single-character tokens, and tokens that
    are purely numeric once punctuation is stripped.
    """
    try:
        # Python 2: work on a UTF-8 byte string for TextBlob.
        message = message.encode('utf-8').lower()
    except Exception:
        # Unexpected input type (e.g. not a string) — report and abort.
        print(type(message))
        sys.exit()
    words = TextBlob(message).words
    stop_words = set(stopwords.words('english'))
    # Compiled once, outside the per-word loop (was recompiled per word).
    non_word = re.compile(r'\W')
    lemmas = []
    for word in (w.lemma for w in words):
        if len(word) == 1:
            continue  # single characters carry no signal
        if word in stop_words:
            continue
        # Skip tokens whose \W-split parts are all digits (e.g. "3.14").
        # Note ''.isdigit() is False, so a token with an empty or
        # alphabetic part survives this filter.
        if all(part.isdigit() for part in non_word.split(word)):
            continue
        lemmas.append(word)
    return lemmas
papers = {}
with open('MLpapers_lemma.csv','rb') as cf:
rd = csv.reader(cf, delimiter=',', quotechar='"')
header = rd.next()
for r in rd:
words_set = set(r[1].split(' '))
name = r[0]
label = r[2]
papers[name] = (words_set, label)
print 'tot paper #:', len(papers.keys())
'''
out = [['name','text','lable']]
for paper in papers.keys():
entry = [paper, ' '.join(papers[paper][0]), papers[paper][1]]
out.append(entry)
print 'len(out)', len(out)
with open('MLpapers_lemma.csv','wb') as cf:
wr = csv.writer(cf, delimiter=',', quotechar='"')
wr.writerows(out)
'''
'''
fh = open('checkWords.txt','rb')
checkWords0 = set(fh.readline().split(' '))
fh.close()
print '# of checkWords0',len(checkWords0)
fh = open('checkWords1.txt','rb')
checkWords1 = set(fh.readline().split(' '))
fh.close()
print '# of checkWords1',len(checkWords1)
fh = open('checkWords2.txt','rb')
checkWords2 = set(fh.readline().split(' '))
fh.close()
print '# of checkWords2',len(checkWords2)
'''
fh = open('checkWords.txt','rb')
checkWords3 = set(fh.readline().split(' '))
fh.close()
print '# of checkWords3',len(checkWords3)
'''
falls_p = []
falls_n = []
for paper in papers.keys():
predict3 = 'Non-data'
word_set = papers[paper][0]
label = papers[paper][1]
count = 0
for word in checkWords3:
if word in word_set:
count += 1
if count > 0:
if label != 'Data':
falls_p.append((paper,label))
else:
if label == 'Data':
falls_n.append((paper,label))
'''
# Collect papers containing none of the checkWords3 keywords; these are
# re-examined below against raw-text phrases.
paper_un = []
for name, (word_set, label) in papers.items():
    # isdisjoint replaces the manual break-on-first-hit membership loop.
    if checkWords3.isdisjoint(word_set):
        paper_un.append((name, label))
#print len(paper_un)
#print 'falls_n',len(falls_n)
#print 'falls_p',len(falls_p)
#print [x[0] for x in falls_p]
papers_pdf = {}
with open('MLpapers_whole.csv','rb') as cf:
rd = csv.reader(cf, delimiter=',', quotechar='|')
header = rd.next()
for r in rd:
text = r[1]
name = r[0]
label = r[2]
papers_pdf[name] = (text, label)
print len(papers_pdf.keys())
phrase = [' Alexa', 'WHOIS','wallet','geolocation','Rockyou', 'yelp']
falls_p = []
falls_n = []
for paper, label in paper_un:
data = False
for word in phrase:
if word in papers_pdf[paper][0]:
data = True
break
if data:
if label != 'Data':
falls_p.append(paper)
else:
if label == 'Data':
falls_n.append(paper)
print 'falls_n',len(falls_n)
print 'falls_p',len(falls_p)