-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathfinal_test2.py
106 lines (90 loc) · 2.41 KB
/
final_test2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import csv, sys, re
from nltk.corpus import stopwords
from textblob import TextBlob
csv.field_size_limit(sys.maxsize)
# Load every paper and its label from the lemmatised corpus CSV.
# Column 0 is the paper name, column 1 the space-separated text and
# column 2 the label; `papers` maps name -> (set of words, label).
papers = {}
# 'rb' is kept because this script is written for Python 2, whose csv
# module expects files opened in binary mode.
with open('papers400_lemma.csv', 'rb') as cf:
    rd = csv.reader(cf, delimiter=',', quotechar='"')
    # The first row is the column header. next(rd, None) instead of
    # rd.next() so an empty file leaves `papers` empty instead of raising
    # StopIteration.
    header = next(rd, None)
    for r in rd:
        if not r:
            # csv.reader yields [] for blank lines (e.g. a stray trailing
            # newline); indexing such a row would raise IndexError.
            continue
        papers[r[0]] = (set(r[1].split(' ')), r[2])
# A single pre-formatted argument prints exactly what the original
# two-item print statement did.
print('%s %d' % ('tot paper #:', len(papers)))
# Disabled code, kept as a bare string literal so it is never executed.
# It rebuilt a table (header row 'name', 'text', 'lable' [sic]) from the
# in-memory `papers` dict, joining each word set back into one string, and
# wrote it over papers400_lemma.csv.
'''
out = [['name','text','lable']]
for paper in papers.keys():
entry = [paper, ' '.join(papers[paper][0]), papers[paper][1]]
out.append(entry)
print 'len(out)', len(out)
with open('papers400_lemma.csv','wb') as cf:
wr = csv.writer(cf, delimiter=',', quotechar='"')
wr.writerows(out)
'''
#open checkWords file
# The string literal below is disabled code (a bare string, never executed):
# a variant that read the first line of checkWords1.txt and checkWords2.txt
# into the sets checkWords1 and checkWords2. Note that its second count
# message reports checkWords1 again rather than checkWords2.
'''
fh = open('checkWords1.txt','rb')
checkWords1 = set(fh.readline().split(' '))
fh.close()
print '# of checkWords1',len(checkWords1)
fh = open('checkWords2.txt','rb')
checkWords2 = set(fh.readline().split(' '))
fh.close()
print '# of checkWords1',len(checkWords1)
'''
# Load the check-word vocabulary from the first line of checkWords1s.txt.
# `with` guarantees the handle is closed even if reading fails.
with open('checkWords1s.txt', 'rb') as fh:
    # split() with no argument discards the trailing newline that
    # readline() keeps and collapses repeated blanks. The previous
    # split(' ') left "\n" glued to the last word (so it could never match)
    # and could add an empty-string "word" to the set.
    checkWords = set(fh.readline().split())
# A single pre-formatted argument prints exactly what the original
# two-item print statement did.
print('%s %d' % ('# of checkWords', len(checkWords)))
# Evaluate the keyword rule against the labels: a paper is treated as
# 'Data' when it contains more than one distinct check word.
#   falls_p - papers the rule flags as 'Data' whose label is not 'Data'
#   falls_n - papers the rule does not flag whose label is 'Data'
# Each entry is a (paper name, label) tuple.
falls_p = []
falls_n = []
for paper, (word_set, label) in papers.items():
    # Both operands are sets, so the size of the intersection equals the
    # number of check words present in this paper.
    count = len(checkWords & word_set)
    if count > 1:
        if label != 'Data':
            falls_p.append((paper, label))
    elif label == 'Data':
        falls_n.append((paper, label))
# Only referenced by the disabled experiment kept in the string literal
# that follows this block.
phrase = []
# Disabled code, kept as a bare string literal so it is never executed.
# Alternative rule: predict 'Data' as soon as any entry of `phrase` or any
# check word occurs in the paper's word set, then record every mismatch
# with the label in falls_n (label is 'Data') or falls_p (otherwise).
'''
for paper in papers.keys():
predict1 = 'Non-data'
word_set = papers[paper][0]
label = papers[paper][1]
for w in phrase:
if w in word_set:
predict1 = 'Data'
break
for word in checkWords:
if word in word_set:
predict1 = 'Data'
break
if predict1 != label:
if label == 'Data':
falls_n.append((paper,label))
else:
falls_p.append((paper,label))
'''
# Report how many papers ended up in each misclassification list.
# Each line is pre-formatted into one string, which prints the same text
# as a two-item print statement.
for tag, misses in (('falls_n', falls_n), ('falls_p', falls_p)):
    print('%s %d' % (tag, len(misses)))
#print [x[0] for x in falls_p]
# Disabled code, kept as a bare string literal so it is never executed.
# It read papers400_whole.csv (comma-delimited, '|' as the quote character),
# skipped the header row and filled papers_pdf with
# name -> (text, label), keeping the text column as one unsplit string.
'''
papers_pdf = {}
with open('papers400_whole.csv','rb') as cf:
rd = csv.reader(cf, delimiter=',', quotechar='|')
header = rd.next()
for r in rd:
text = r[1]
name = r[0]
label = r[2]
papers_pdf[name] = (text, label)
'''