-
Notifications
You must be signed in to change notification settings - Fork 0
/
word2diavec.py
190 lines (157 loc) · 7.94 KB
/
word2diavec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import argparse
import re
import vector_training
from gensim.models import FastText, Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
parser = argparse.ArgumentParser()
# Preload saved models, or train some from existing tweet data
parser.add_argument('-fasttext', '--ft', action="store", dest="ft", type=str, default="fasttext.model")
parser.add_argument('-word2vec', '--w2v', action="store", dest="w2v", type=str, default="word2vec.model")
# TSV of tweet data handed to vector_training.main when training fresh models
parser.add_argument('-tweet_csv', '--csv', action="store", dest="t", type=str, default="tweet_data.tsv")
# When set, load models from the paths above instead of retraining
parser.add_argument('-load', '--l', action="store_true", dest="load")
parser.add_argument('-text_output', '--out', action="store", dest="outtxt", type=str, default="text_output.txt")
# Line-separated tokens compared across models by most_similar()
parser.add_argument('-text_input', '--in', action="store", dest="intxt", type=str, default="text_input.txt")
# Analogy data set in Mikolov-style "a a' b b'" lines (default: word-test.v1.txt)
parser.add_argument('-analogies', '--an', action="store", dest="analogies", type=str, default="word-test.v1.txt")
# Toggles exploratory mode in linzen_tests()
parser.add_argument('-explore', '--exp', action="store_true", dest="expl")
# NOTE: deliberately rebinds 'parser' to the parsed Namespace; the rest of
# the module reads options as parser.<dest> (e.g. parser.ft, parser.expl).
parser = parser.parse_args()
def cosines(model1, model2):
    """
    Takes cosine similarities between identical word pairs in two vector space
    models, then averages the results together and prints a summary.
    Requires two models trained on the same data (and thus having the same vocab).
    :param model1: KeyedVector object to test
    :param model2: other KeyedVector object to test
    :return: float average cosine similarity over the shared vocabulary
        (0 if the vocabulary is empty)
    """
    vector_similarity = {}
    vector_average = 0
    for word in model1.vocab:
        # reshape(1, -1): sklearn's cosine_similarity expects 2-D arrays
        vector_similarity[word] = cosine_similarity(model1[word].reshape(1, -1), model2[word].reshape(1, -1))[0][0]
        vector_average += vector_similarity[word]
    # Average over the words actually compared. (Was len(model1.vectors);
    # equal in size for gensim KeyedVectors, but this also guards an empty vocab.)
    if vector_similarity:
        vector_average = vector_average / len(vector_similarity)
    print(f'Done with cosine similarity of {len(vector_similarity)} tokens; average {vector_average}')
    return vector_average
def most_similar(model1, model2, toks):
    """
    For each given token, pairs its cross-model cosine similarity with the
    three most-similar neighbors from each model. For qualitative analysis.
    :param model1: KeyedVector object to test
    :param model2: other KeyedVector object to test
    :param toks: list of tokens to compare between models
    :return: dict of tokens with cosine similarities and most similar words
    """
    comparison = {}
    for token in toks:
        # sklearn expects 2-D input, hence the reshape
        vec1 = model1[token].reshape(1, -1)
        vec2 = model2[token].reshape(1, -1)
        similarity = cosine_similarity(vec1, vec2)[0][0]
        comparison[token] = (similarity,
                             model1.most_similar(token)[:3],
                             model2.most_similar(token)[:3])
    return comparison
def linzen_tests(analogies, model, exploratory=False):
    """
    This function has two modes, regular and exploratory.
    Regular:
    Following Linzen (2016), "Issues in evaluating semantic spaces using word analogies,"
    this function uses "vanilla," "only-b," and "ignore-a" word analogy metrics to measure semantic space accuracy
    by finding the "correct" analogy offset.
    - Vanilla: literal offset method (a:a'::b:__)
    - Only-B: returns the nearest neighbor of b
    - Ignore-A: returns the word most similar to a' and b (a' + b)
    Cosine similarities are calculated for each metric on a four-word analogy pair (e.g., cat:cats::dog:dogs).
    They then are compared with the given b, and given a point if accurate.
    The function returns their total points divided by number of analogies.
    Exploratory:
    Runs the same Linzen metrics as Regular, but instead returns a list of dictionaries of the model's answers.
    Allows for the discovery of what the model "thinks" a word belongs to, such as yall:[placename].
    Activate with exploratory=True.
    :param analogies: list of strings of analogies to test, in the order [a, a', b, __].
    :param model: vector space model to be tested
    :param exploratory: toggles exploratory mode. Defaults to False
    :return: list of accuracy scores for vanilla, only-b, and ignore-a offsets
        ([0.0, 0.0, 0.0] when no analogies are given);
        in exploratory mode, list of dicts of tuples of analogy answers
    """
    count = [0, 0, 0]
    explore = []
    for i, tok in enumerate(analogies):
        # Skip any analogy with an out-of-vocabulary token to avoid KeyErrors
        if all(t in model.vocab for t in tok):
            a, ap, b, xp = tok
            # negative must be a list: gensim iterates the argument, so a bare
            # string would be consumed character-by-character.
            if not exploratory:
                if model.most_similar(positive=[ap, b], negative=[a])[0][0] == xp:  # vanilla
                    count[0] += 1
                if model.most_similar(b)[0][0] == xp:  # only-b
                    count[1] += 1
                if model.most_similar(positive=[ap, b])[0][0] == xp:  # ignore-a
                    count[2] += 1
            else:
                explore_dict = {f'{a} : {ap} :: {b} : ': (model.most_similar(positive=[ap, b], negative=[a])[0][0],
                                                          model.most_similar(b)[0][0],
                                                          model.most_similar(positive=[ap, b])[0][0])}
                explore.append(explore_dict)
        if i % 500 == 0:
            print(f'{i+1} / {len(analogies)} Linzen metrics done')
    if exploratory:
        return explore
    if not analogies:
        # Was scalar 0, inconsistent with the documented list return type
        print("No analogies found.")
        return [0.0, 0.0, 0.0]
    return [num / len(analogies) for num in count]
def analogy_parse(analogies, start="", end=""):
    """
    Reads and listifies analogy sets in the form "a:a'::b:b'" where the colons
    are spaces or tabs. Users may specify where to begin and end in the file in
    case of colon-delimited sections of interest, e.g. at ": city-to-state" in
    the Mikolov et al. (2013) data set.
    :param analogies: path to text file containing line-separated analogy sets.
    :param start: string for section to begin on
    :param end: string to end parsing
    :return: list of lists of strings of tweets
    """
    parsed = []
    with open(analogies, 'r') as source:
        # With no start marker, collect from the top of the file
        collecting = start == ""
        for line in source:
            stripped = line.strip()
            if stripped == start:
                collecting = True  # skips to metrics relevant for data
            elif end != "" and stripped == end:
                break
            if not collecting or ':' in line:
                continue  # not yet in section, or a colon-marked section header
            # Normalize: lowercase and tab-delimit the four analogy tokens
            normalized = stripped.replace(' ', '\t').lower()
            parsed.append(normalized.split('\t'))
    return parsed
def main(load_bool):
    """
    Trains or loads FastText and Word2Vec models, runs the Linzen analogy
    metrics and cross-model token comparisons, and writes results to the
    output path given on the command line (module-global parser namespace).
    :param load_bool: flag to determine whether to open models from provided paths
        or to train new ones using vector_training.
    """
    if not load_bool:
        vector_training.main(parser.t)  # will save to default paths
        # Load from the default paths vector_training just wrote, not parser.ft/w2v
        ft = FastText.load("fasttext.model").wv
        w2v = Word2Vec.load("word2vec.model").wv
        print("Hello words! Models trained and loaded.")
    else:
        ft = FastText.load(parser.ft).wv
        w2v = Word2Vec.load(parser.w2v).wv
        print("Hello words! Models loaded.")
    analogies_list = analogy_parse(parser.analogies, start="", end="")
    ft_scores = linzen_tests(analogies_list, ft, exploratory=parser.expl)
    w2v_scores = linzen_tests(analogies_list, w2v, exploratory=parser.expl)
    with open(parser.intxt, 'r') as inp:
        pronoun_list = [i.strip() for i in inp]
    top_pronouns = most_similar(ft, w2v, pronoun_list)
    with open(parser.outtxt, 'w') as out:
        out.write(f'Linzen scores\texplore={parser.expl}\n' + '='*25)
        out.write(f'\nFT: {ft_scores}\t\tw2v: {w2v_scores}\n\n\n\n')
        out.write(f'Token similarity\n' + '='*25)
        # Plain loop instead of a list comprehension used only for its side effects
        for p in top_pronouns:
            out.write(f'\n{p}: {top_pronouns[p][0]}\n'
                      f'\t{top_pronouns[p][1]}'
                      f'\n\t{top_pronouns[p][2]}')
    print(f'Data saved to {out.name}')
# Script entry point: trains or loads models depending on the -load flag
if __name__ == '__main__':
    main(parser.load)