-
Notifications
You must be signed in to change notification settings - Fork 0
/
represent.py
114 lines (93 loc) · 3.27 KB
/
represent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import pickle as pk
import numpy as np
from gensim.corpora import Dictionary
from util import flat_read
# Feature-extraction hyperparameters.
embed_len = 200  # dimensionality of each word vector / embedding-matrix row
min_freq = 1  # drop words seen fewer than this many times (1 = keep all)
max_vocab = 5000  # cap on vocabulary size (not counting the pad/oov slots)
seq_len = 30  # fixed sentence length after left-padding / truncation
pad_ind, oov_ind = 0, 1  # reserved indices: 0 = padding, 1 = out-of-vocabulary
path_word_vec = 'feat/word_vec.pkl'  # input: pickled word-vector model (presumably gensim KeyedVectors-like; it exposes .vocab and word indexing — confirm)
path_word_ind = 'feat/word_ind.pkl'  # output: word -> index mapping
path_embed = 'feat/embed.pkl'  # output: embedding matrix (numpy array)
path_label_ind = 'feat/label_ind.pkl'  # output: label -> index mapping
def tran_dict(word_inds, off):
    """Return a copy of the word->index mapping with every index shifted by ``off``.

    The shift reserves the low indices (0 for padding, 1 for OOV) without
    disturbing the relative order of the vocabulary indices.

    Args:
        word_inds: mapping of word -> integer index.
        off: non-negative offset added to every index.

    Returns:
        A new dict with the same keys and shifted values.
    """
    return {word: ind + off for word, ind in word_inds.items()}
def embed(sent_words, path_word_ind, path_word_vec, path_embed):
    """Build the vocabulary and its embedding matrix, then pickle both.

    Fits a gensim Dictionary on the tokenized sentences, trims it to at
    most ``max_vocab`` entries, shifts all indices by 2 so rows 0/1 stay
    reserved for padding/OOV, and fills an embedding matrix from the
    pretrained vectors loaded from ``path_word_vec``.

    Args:
        sent_words: list of token lists, one per sentence.
        path_word_ind: where to dump the word -> index mapping (pickle).
        path_word_vec: pickled pretrained word-vector model (must expose
            ``.vocab`` and word indexing — gensim-3-style KeyedVectors).
        path_embed: where to dump the (vocab_num, embed_len) matrix.
    """
    model = Dictionary(sent_words)
    model.filter_extremes(no_below=min_freq, no_above=1.0, keep_n=max_vocab)
    # Shift every index by 2 to reserve row 0 (pad) and row 1 (oov).
    word_inds = tran_dict(model.token2id, off=2)
    with open(path_word_ind, 'wb') as f:
        pk.dump(word_inds, f)
    with open(path_word_vec, 'rb') as f:
        word_vecs = pk.load(f)
    # NOTE(review): `.vocab` is the gensim<4 KeyedVectors API — confirm the
    # pickled model matches (gensim 4 renamed it to `key_to_index`).
    vocab = word_vecs.vocab
    vocab_num = min(max_vocab + 2, len(word_inds) + 2)
    embed_mat = np.zeros((vocab_num, embed_len))
    for word, ind in word_inds.items():
        # BUG FIX: indices are offset by 2, so valid rows run up to
        # vocab_num - 1 (= max_vocab + 1 at full capacity).  The previous
        # guard `ind < max_vocab` silently left the last two in-vocabulary
        # words with all-zero embedding rows.
        if word in vocab and ind < vocab_num:
            embed_mat[ind] = word_vecs[word]
    with open(path_embed, 'wb') as f:
        pk.dump(embed_mat, f)
def label2ind(labels, path_label_ind):
    """Map each distinct label to a stable integer index and pickle the mapping.

    Indices are assigned in sorted label order, so the mapping is
    deterministic across runs regardless of the order labels appear in.

    Args:
        labels: iterable of (possibly repeated) label values.
        path_label_ind: where to dump the label -> index dict (pickle).
    """
    label_inds = {label: i for i, label in enumerate(sorted(set(labels)))}
    with open(path_label_ind, 'wb') as f:
        pk.dump(label_inds, f)
def sent2ind(words, word_inds, seq_len, keep_oov):
    """Convert a word sequence into a fixed-length index sequence.

    Known words map through ``word_inds``; unknown words map to ``oov_ind``
    when ``keep_oov`` is true and are dropped otherwise.  The result is
    left-padded with ``pad_ind`` up to ``seq_len``, or truncated to keep
    only the last ``seq_len`` indices.

    Args:
        words: iterable of tokens.
        word_inds: word -> index mapping.
        seq_len: target sequence length.
        keep_oov: whether unknown words are kept as ``oov_ind``.

    Returns:
        A list of exactly ``seq_len`` integer indices.
    """
    indices = [word_inds[w] if w in word_inds else oov_ind
               for w in words
               if keep_oov or w in word_inds]
    missing = seq_len - len(indices)
    if missing > 0:
        return [pad_ind] * missing + indices
    return indices[-seq_len:]
def align(sent_words, labels, path_sent, path_label):
    """Encode sentences and labels as numpy index arrays and pickle them.

    Loads the previously saved word and label index mappings, pads or
    truncates every sentence to ``seq_len`` indices (unknown words kept
    as OOV), converts labels to their integer ids, and dumps both arrays.

    Args:
        sent_words: list of token lists, one per sentence.
        labels: list of labels aligned with ``sent_words``.
        path_sent: where to dump the (n, seq_len) sentence array (pickle).
        path_label: where to dump the (n,) label-id array (pickle).
    """
    with open(path_word_ind, 'rb') as f:
        word_inds = pk.load(f)
    with open(path_label_ind, 'rb') as f:
        label_inds = pk.load(f)
    pad_seqs = np.array([sent2ind(words, word_inds, seq_len, keep_oov=True)
                         for words in sent_words])
    inds = np.array([label_inds[label] for label in labels])
    with open(path_sent, 'wb') as f:
        pk.dump(pad_seqs, f)
    with open(path_label, 'wb') as f:
        pk.dump(inds, f)
def vectorize(path_data, path_sent, path_label, mode):
    """Featurize one data split: texts -> padded index arrays, labels -> ids.

    Sentences are split character-by-character (``list(sent)``).  In
    'train' mode the vocabulary, embedding matrix and label mapping are
    built first; every other mode reuses the artifacts saved at training.

    Args:
        path_data: CSV file with 'text' and 'label' columns.
        path_sent: output path for the pickled sentence-index array.
        path_label: output path for the pickled label-id array.
        mode: 'train' builds vocab/embeddings; anything else only aligns.
    """
    texts = flat_read(path_data, 'text')
    labels = flat_read(path_data, 'label')
    sent_words = [list(text) for text in texts]
    if mode == 'train':
        embed(sent_words, path_word_ind, path_word_vec, path_embed)
        label2ind(labels, path_label_ind)
    align(sent_words, labels, path_sent, path_label)
if __name__ == '__main__':
    # Featurize each split in order; 'train' must run first because it
    # builds the vocabulary, embedding matrix and label mapping that the
    # 'dev' and 'test' splits reuse.  The three copy-pasted stanzas of the
    # original are collapsed into one loop (same paths, same call order).
    for mode in ('train', 'dev', 'test'):
        path_data = 'data/%s.csv' % mode
        path_sent = 'feat/sent_%s.pkl' % mode
        path_label = 'feat/label_%s.pkl' % mode
        vectorize(path_data, path_sent, path_label, mode)