-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathBaseline.py
84 lines (75 loc) · 2.4 KB
/
Baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# Corpus files are opened once at module level and shared by the script
# below; both paths are resolved relative to the current working directory.
train = open('train.txt', 'r')
test = open('test.txt', 'r')
def containsAssoc(lst, ele):
    """Return the index of the first item in lst whose first element equals ele.

    lst is a sequence of indexable items (for example (key, value) tuples).
    Only element 0 of each item is inspected, so items of any non-zero
    length are accepted.

    Returns -1 when no item matches, including when lst is empty.
    """
    for index, item in enumerate(lst):
        if ele == item[0]:
            return index
    return -1
def preprocess(train):
    """Count, for every word, how often it carries each non-'O' tag.

    train is an open text file or any other iterable of lines. Lines are
    consumed in groups of three: the first line of a group holds the
    whitespace-separated words, the second is ignored, and the third holds
    one tag per word. Words tagged 'O' are not recorded.

    If the word line and the tag line of a group differ in length, the
    surplus entries on the longer line are ignored.

    Returns a dict mapping word -> {tag: count}; words that were only ever
    tagged 'O' do not appear in it.
    """
    lexicon = {}
    word_lst = []
    # Lines are numbered from 1 so that the remainder mod 3 identifies the
    # role of the line inside its group (1 = words, 2 = skipped, 0 = tags).
    for line_no, line in enumerate(train, 1):
        role = line_no % 3
        if role == 1:
            word_lst = line.split()
        elif role == 0:
            for word, tag in zip(word_lst, line.split()):
                if tag == 'O':
                    continue
                tag_counts = lexicon.setdefault(word, {})
                tag_counts[tag] = tag_counts.get(tag, 0) + 1
    return lexicon
def createBaseline(lexicon):
    """Collapse each word's tag counts to its single most frequent tag.

    lexicon maps word -> {tag: count}. It is modified in place so that each
    word maps directly to the tag with the highest count, and the same dict
    object is returned. When several tags share the highest count, the one
    that comes first in the inner dict's iteration order wins.

    Raises ValueError if a word has an empty count dict. The check runs
    before anything is modified, so lexicon is left untouched in that case.
    """
    best_tags = {}
    for word, tag_counts in lexicon.items():
        if not tag_counts:
            # Without this guard a word with no counts would have no
            # well-defined winner; fail loudly instead of guessing.
            raise ValueError("no tag counts recorded for %r" % (word,))
        best_tags[word] = max(tag_counts, key=tag_counts.get)
    # Only existing keys are overwritten, so the set of words is unchanged.
    lexicon.update(best_tags)
    return lexicon
def runTest(test, lexicon, output_path='output.txt'):
    """Label the words of a test file with a lexicon and write the result.

    test is an open text file or any other iterable of lines, consumed in
    groups of three: the first line of a group holds the words, the second
    is ignored, and the third holds one label per word (presumably the
    word's position in the corpus -- confirm against the data format).

    lexicon maps word -> tag. A word contributes to the PER, LOC, ORG or
    MISC row whenever that category name is contained in its tag (so a tag
    such as "B-PER" counts as PER). Words missing from lexicon are skipped.

    The result is written to output_path as four lines, one per category in
    the order PER, LOC, ORG, MISC. Each line is the category name, a comma,
    and then one "<label>-<label> " entry per matching word. There is no
    trailing newline after the last line.
    """
    categories = ("PER", "LOC", "ORG", "MISC")
    entries = dict((category, []) for category in categories)
    word_lst = []
    # Lines are numbered from 1 so that the remainder mod 3 identifies the
    # role of the line inside its group (1 = words, 2 = skipped, 0 = labels).
    for line_no, line in enumerate(test, 1):
        role = line_no % 3
        if role == 1:
            word_lst = line.split()
        elif role == 0:
            for word, pos in zip(word_lst, line.split()):
                if word not in lexicon:
                    continue
                tag = lexicon[word]
                # Every category is checked independently, so a tag that
                # contains several category names is listed under each.
                for category in categories:
                    if category in tag:
                        entries[category].append(pos + "-" + pos + " ")
    rows = [category + "," + "".join(entries[category])
            for category in categories]
    # The context manager closes the file even if the write fails.
    with open(output_path, 'w') as output:
        output.write("\n".join(rows))
# Build the most-frequent-tag lexicon from the training file, then use it to
# label the test file. The module-level handles are closed here even when a
# step fails, rather than being left open until interpreter shutdown.
try:
    baseline = createBaseline(preprocess(train))
    runTest(test, baseline)
finally:
    train.close()
    test.close()