-
Notifications
You must be signed in to change notification settings - Fork 3
/
glove.py
43 lines (32 loc) · 1.42 KB
/
glove.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""Loader for Glove embeddings, builds glove matrix from vocab."""
import numpy as np
from nlputils import raw_count
from tqdm import tqdm
class GloveLoader:
    """Builds a vocabulary-aligned embedding matrix from a GloVe text file."""

    def __init__(self, glove_file):
        """
        :param glove_file: Path to a GloVe text file with one ``token v1 v2 ... vN``
            entry per line, fields separated by whitespace.
        """
        self.glove_file = glove_file

    def build_embeddings(self, vocab, factor=0.1):
        """
        Builds an embedding matrix for all tokens in the vocabulary. Tokens for
        which glove embeddings exist will be represented as glove embeddings. Missing tokens
        keep their random initialization.

        The embedding size is taken from the first non-blank line of the file. Tokens that
        themselves contain whitespace are handled by splitting each line from the right, so
        only the last ``emb_size`` fields are treated as vector components.

        :param vocab: Vocab object from nlputils.py mapping words-->indices and vice versa.
            Only ``len(vocab)``, ``word in vocab`` and ``vocab[word]`` are used here.
        :param factor: Scale applied to the ``np.random.rand`` initialization, i.e. rows
            without a glove embedding hold values drawn uniformly from ``[0, factor)``.
        :return: Numpy matrix (vocab_len, emb_size)
        :raises ValueError: If the file contains no embeddings, or if an entry for an
            in-vocabulary token does not have exactly ``emb_size`` numeric components.
        """
        embs = None
        emb_size = None
        # only used as the progress bar total
        line_count = raw_count(self.glove_file)
        num_embs_found = 0
        # GloVe files are UTF-8; do not depend on the platform's default encoding
        with open(self.glove_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(tqdm(f, total=line_count), start=1):
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. a trailing newline at end of file)
                    continue
                # we wait for first line to calculate embedding size, then build array
                if emb_size is None:
                    emb_size = len(line.split()) - 1
                    embs = np.random.rand(len(vocab), emb_size) * factor
                # split from the right so a token containing spaces stays in one piece
                parts = line.rsplit(None, emb_size)
                word = parts[0]
                values = parts[1:]
                if word in vocab:
                    if len(values) != emb_size:
                        # a short row would otherwise be silently broadcast across the row
                        raise ValueError(
                            'Line %d of %s has %d vector components, expected %d'
                            % (line_no, self.glove_file, len(values), emb_size))
                    # convert explicitly instead of relying on an implicit str->float cast
                    embs[vocab[word]] = np.asarray(values, dtype=float)
                    num_embs_found += 1
        if embs is None:
            raise ValueError('No embeddings found in %s' % self.glove_file)
        # report coverage of the vocabulary, not of the (much larger) glove file
        vocab_size = len(vocab)
        fraction_found = num_embs_found / vocab_size if vocab_size else 0.0
        print('Fraction embeddings found: %s' % fraction_found)
        return embs