-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathprocess_embedding.py
58 lines (47 loc) · 1.55 KB
/
process_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
import pickle as pkl
import os

# Build a GloVe-based embedding matrix for a fixed vocabulary.
#
#   1. Parse the raw GloVe text file into a {word: vector} dict and cache it
#      as a pickled object inside an .npy file (slow step, done once).
#   2. Load the vocabulary (word -> row index) pickle produced elsewhere.
#   3. Fill one embedding row per vocab index, falling back to the GloVe
#      'UNKNOWN' vector for out-of-vocabulary words.
#
# Ported from Python 2 (print statements / list-returning map) to code that
# runs on Python 3; all messages, paths and saved artifacts are unchanged.

data_dir = 'data/'
glove = 'glove.840B.300d'
processed_dir = os.path.join(data_dir, 'processed')
dic_path = os.path.join(processed_dir, glove + '.dic.npy')

# Create the output directory up front: the original np.save calls crashed
# on a fresh checkout when data/processed/ did not exist yet.
if not os.path.isdir(processed_dir):
    os.makedirs(processed_dir)

if not os.path.exists(dic_path):
    print('Reading original Glove file...')
    print('Processing original Glove file to dictionary...\n')
    embedding = dict()
    # Stream line-by-line instead of readlines(): glove.840B.300d.txt is
    # ~5 GB and does not need to be held in memory as a list of lines.
    with open(os.path.join(data_dir, glove + '.txt')) as f:
        for line in f:
            splited = line.split()
            # token followed by its float vector components
            embedding[splited[0]] = [float(v) for v in splited[1:]]
    # Save Glove as dic file (np.save pickles the dict as an object array).
    np.save(dic_path, embedding)
else:
    print('Glove dictionary exists!')
    print('Loading Glove dictionary...\n')
    # allow_pickle=True is mandatory for object arrays on NumPy >= 1.16.3;
    # the bare .item() call fails there with "Object arrays cannot be loaded".
    embedding = np.load(dic_path, allow_pickle=True).item()

# Make pre-trained embedding with GloVe
print('Generate pre-trained embedding with Glove')
# Pickle files must be opened in binary mode.
with open('data/processed/vocab_xinyadu.dic', 'rb') as f:
    vocab = pkl.load(f)

# Initialize every row to the UNKNOWN vector; known words overwrite theirs below.
embedding_vocab = np.tile(embedding['UNKNOWN'], [len(vocab), 1])
'''
vocab['<PAD>'] = 0
vocab['<GO>'] = 1
vocab['<EOS>'] = 2
vocab['<UNK>'] = 3
'''
embedding_vocab[0] = 0.0                   # vocab['<PAD>'] = 0 (zero vector)
embedding_vocab[1] = embedding['<s>']      # vocab['<GO>']  = 1
embedding_vocab[2] = embedding['EOS']      # vocab['<EOS>'] = 2
embedding_vocab[3] = embedding['UNKNOWN']  # vocab['<UNK>'] = 3

unk_num = 0
for word, idx in vocab.items():
    if word in embedding:
        embedding_vocab[idx] = embedding[word]
    else:
        unk_num += 1
np.save('data/processed/glove_embedding.npy', embedding_vocab)

# check how many unknown words
print('vocab size : %d' % len(embedding_vocab))
print('unknown word size : %d' % unk_num)