-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_word_concept_topsim.py
executable file
·156 lines (119 loc) · 3.8 KB
/
check_word_concept_topsim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/python
# produces the file in the format
# ["word", [["concept", sim], ...]]
# using word-concept-topsim-no-parent-0.60-100
# the next step is to cluster the concept lists for each word to decide
# when multiple inheritance should be used
import features
import operator
import pdb
import sys
import simplejson as json
import os
from nltk.corpus import wordnet as nlwn
import sqlite3
from make_pmi_db import load_concepts_mem_db
from make_pmi_db import load_words_mem_db
import featuremap
import math
from collections import defaultdict
def _load_line_set(path):
    """Return the set of whitespace-stripped lines read from *path*.

    The file is opened in a ``with`` block so the handle is released even if
    reading fails part-way through.
    """
    with open(path) as fin:
        return set(line.strip() for line in fin)


# Vocabulary filter: show_all() only reports words contained in this set.
words = _load_line_set('words-no-adverbs')
# One entry per line of 'proper_names'.  Not referenced by the code visible
# in this file; kept so that anything importing `names` keeps working.
names = _load_line_set('proper_names')
def word_concept_sim(word, concept):
    """Return the best Lin similarity between *word* and *concept*.

    Args:
        word: surface form looked up among WordNet noun synsets.
        concept: WordNet synset name (the form accepted by ``nlwn.synset``).

    Returns:
        The largest ``lin_similarity`` value between any noun synset of
        *word* and the concept synset, or 0.0 when *word* has no noun synset
        or no sense scores above zero.

    Similarity is computed against ``features.ic`` (presumably an
    information-content table -- confirm in the ``features`` module).
    """
    target = nlwn.synset(concept)
    best = 0.0
    for sense in nlwn.synsets(word, pos='n'):
        score = sense.lin_similarity(target, features.ic)
        if score > best:
            best = score
    return best
def is_too_sim(word, concept, t=0.30):
    """Return True when *concept* is too close to *word* to be worth reporting.

    For every noun base form of *word* (``nlwn._morphy``) the pair is rejected
    when either:

    * ``word_concept_sim(stem, concept)`` exceeds *t*, or
    * the concept synset appears in ``features.all_hypernyms`` of one of the
      stem's noun senses, i.e. WordNet already places the word under it.

    Args:
        word: surface form to test.
        concept: WordNet synset name.
        t: Lin-similarity cut-off; scores strictly above it count as too
            similar.

    Returns:
        True if any base form triggers one of the two conditions, otherwise
        False (including when *word* has no noun base form at all).
    """
    stems = nlwn._morphy(word, pos='n')
    if not stems:
        return False

    # The concept synset does not depend on the stem, so resolve it once.
    concept_synset = nlwn.synset(concept)
    for stem in stems:
        if word_concept_sim(stem, concept) > t:
            return True
        for sense in nlwn.synsets(stem, pos='n'):
            if concept_synset in features.all_hypernyms(sense):
                return True
    return False
def _looks_like_proper_noun(word):
    """Return True if a noun sense of *word* marks it as a named entity.

    A word qualifies when any of its noun synsets has instance hypernyms, or
    when any lemma of those synsets is spelled as one capital letter followed
    by lowercase letters only (e.g. ``Paris``).
    """
    for synset in nlwn.synsets(word, pos='n'):
        if synset.instance_hypernyms():
            return True
        for lemma in synset.lemmas():
            lemma_name = lemma.name()
            tail = lemma_name[1:]
            # Slicing (rather than indexing) keeps an empty name harmless.
            if lemma_name[:1].isupper() and tail.islower() and tail.isalpha():
                return True
    return False


def show_all(word_concept_topsim_filename, top_threshold=0.067,
             other_threshold=0.11):
    """Print the accepted (word, concept) pairs found in a top-sim file.

    Each input line is a JSON value ``[word, [[concept, sim], ...]]``.  Up to
    two leading lines that start with ``loaded`` are treated as log output and
    skipped, as are blank lines.

    A word is considered only if it is in the module-level ``words`` set and
    does not look like a proper noun.  A concept is accepted for the word when
    the pair has not been accepted before, its score is strictly above the
    applicable threshold, and ``is_too_sim`` does not reject it.

    After the whole file is read, each word's accepted concepts are passed
    through ``filter_superconcept_dupes`` and printed one per line as
    ``word : concept -- definition``.

    Args:
        word_concept_topsim_filename: path of the file to read.
        top_threshold: minimum score (exclusive) for the first concept listed
            for a word.
        other_threshold: minimum score (exclusive) for every later concept.
    """
    prev_chosen = set()
    word_concept_dict = defaultdict(list)

    with open(word_concept_topsim_filename) as fin:
        for line_num, line in enumerate(fin):
            if line_num < 2 and line.startswith('loaded'):
                continue
            if not line.strip():
                continue
            word, concept_sim_list = json.loads(line)
            if word not in words:
                continue
            # throw out words that WordNet treats as named entities
            if _looks_like_proper_noun(word):
                continue

            for rank, (concept, top_s) in enumerate(concept_sim_list):
                pair = (word, concept)
                if pair in prev_chosen:
                    continue
                # The first-listed concept gets the more lenient threshold.
                threshold = top_threshold if rank == 0 else other_threshold
                # Test the cheap score cut-off before the WordNet lookups.
                if top_s > threshold and not is_too_sim(word, concept):
                    prev_chosen.add(pair)
                    word_concept_dict[word].append((concept, top_s))

    for word in word_concept_dict:
        concept_list = filter_superconcept_dupes(word_concept_dict[word])
        for concept, top_s in concept_list:
            # Single-argument print() emits the same text under Python 2 and 3.
            print('%s : %s -- %s' % (
                word, concept, nlwn.synset(concept).definition()))
        # For machine-readable output, emit json.dumps((word, concept_list))
        # here instead of the lines above.
def filter_superconcept_dupes(concept_list):
    """Drop concepts that are ancestors or descendants of a better-scored one.

    Args:
        concept_list: list of ``(concept, score)`` pairs, where ``concept`` is
            a WordNet synset name.  NOTE: the list is sorted in place by
            descending score.

    Returns:
        A new list of ``(concept, score)`` tuples in descending score order.
        A concept is kept only if no already-kept concept is in its
        ``features.all_hypernyms`` closure and it is in none of theirs.
    """
    concept_list.sort(key=operator.itemgetter(1), reverse=True)

    kept = []
    # Parallel to `kept`: (synset, hypernym set) for each kept concept, so the
    # synset lookup and hypernym closure are computed once per concept rather
    # than once per comparison.
    kept_info = []
    for concept, s in concept_list:
        synset = nlwn.synset(concept)
        hypernyms = set(features.all_hypernyms(synset))
        related = any(
            other in hypernyms or synset in other_hypernyms
            for other, other_hypernyms in kept_info
        )
        if not related:
            kept.append((concept, s))
            kept_info.append((synset, hypernyms))
    return kept
def main(argv=None):
    """Command-line entry point: report on the file named by the first argument.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: with a usage message when no input file is given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'check_word_concept_topsim.py'
        sys.exit('usage: %s WORD_CONCEPT_TOPSIM_FILE' % prog)
    show_all(argv[1])


if __name__ == "__main__":
    main()