-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
338 lines (285 loc) · 13.2 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import random
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset
import os
import json
import torch
from gensim.models.doc2vec import Doc2Vec
def generate_candidate(file_path):
    """Read a candidate file and build per-user train/test candidate pools.

    The file alternates lines: a user-id line, then a space-separated line
    of candidate item ids. For training, 2000 candidates are drawn uniformly
    with replacement; for testing, the full candidate list is kept.

    Args:
        file_path: path to the UTF-8 candidate file.

    Returns:
        (train_candidate, test_candidate): dicts mapping int user id ->
        np.ndarray of 2000 sampled ids / list of all candidate ids.
    """
    train_candidate = dict()
    test_candidate = dict()
    # 'with' guarantees the handle is closed (original leaked it).
    with open(file_path, 'r', encoding='UTF-8') as fin:
        line = fin.readline().strip()
        while line:
            user_id = int(line)  # renamed from `id` to avoid shadowing the builtin
            line = fin.readline().strip()
            all_candidates = list(map(int, line.split(' ')))
            train_candidate[user_id] = np.random.choice(
                a=all_candidates, size=2000, replace=True, p=None)
            test_candidate[user_id] = all_candidates
            line = fin.readline().strip()
    return train_candidate, test_candidate
def get_news_map_doc2vec(item_map, senK, normal, content_word_f, content_word_w, doc2vec_model, hidden_dim):
    """Build (or load from JSON cache) a news-id -> sentence-embedding matrix map.

    For each news article in `content_word_f` whose raw id appears in
    `item_map`, a (senK x hidden_dim) float32 matrix of per-sentence doc2vec
    embeddings is assembled. Articles with fewer than `senK` sentences are
    padded with the mean of the existing rows; the matrix is then optionally
    normalized per row ("meanstd") or globally min-max scaled ("minmax").
    The result is cached as JSON at `content_word_w`.

    Args:
        item_map: dict mapping raw news-id string -> integer news number.
        senK: number of sentence rows kept per article.
        normal: "meanstd", "minmax", or anything else for no normalization.
        content_word_f: path to the article file (id line, then sentence line).
        content_word_w: path of the JSON cache to read/write.
        doc2vec_model: path of the gensim Doc2Vec model to load.
        hidden_dim: embedding dimensionality per sentence.

    Returns:
        dict mapping int news number -> np.ndarray of shape (senK, hidden_dim).
    """
    # Cache hit: reload the JSON and convert values back to arrays.
    if os.path.exists(content_word_w):
        with open(content_word_w, 'r') as fr:
            tmp = json.load(fr)
        newsMap = {int(key): np.asarray(value) for key, value in tmp.items()}
        print("newsMap size:" + str(len(newsMap)))
        return newsMap

    # Only load the (expensive) doc2vec model when the cache is missing;
    # the original loaded it unconditionally.
    model_dm = Doc2Vec.load(doc2vec_model)
    newsMap = {}    # news id (int) -> (senK x hidden_dim) float32 matrix
    newsMap_2 = {}  # same content as nested lists, for JSON serialization
    with open(content_word_f, 'r', encoding='UTF-8') as article_in:
        line = article_in.readline()
        while line:
            news_id = line.strip()
            if news_id in item_map:
                newsNumber = item_map[news_id]
                news_mat = np.zeros((senK, hidden_dim), dtype=np.float32)
                line = article_in.readline().strip()
                if line == "":  # article with no words: keep the all-zero matrix
                    newsMap[newsNumber] = news_mat
                    newsMap_2[newsNumber] = news_mat.tolist()
                else:
                    line = line[:-2]  # drop the trailing "||" separator
                    sentences = line.split("||")
                    for (i, sen) in enumerate(sentences[:senK]):
                        # Doc2vec tags follow the "<id>-<sentence index>" scheme.
                        sen_embed = model_dm.docvecs[str(news_id) + '-' + str(i)]
                        news_mat[i] += sen_embed
                    length = len(sentences)
                    if length < senK:  # pad to senK rows with the mean row
                        temp = np.sum(news_mat[:length], axis=0) / length
                        for k in range(length, senK):
                            news_mat[k] += temp
                    if normal == "meanstd":
                        # In-place per-row standardization (std of a constant
                        # row is 0 -> divide warning; preserved behavior).
                        for sen in news_mat:
                            sen -= np.mean(sen, axis=0)
                            sen /= np.std(sen, axis=0)
                    elif normal == "minmax":
                        for (i, sen) in enumerate(news_mat):
                            x_max = np.max(sen)
                            x_min = np.min(sen)
                            news_mat[i] = (sen - x_min) / (x_max - x_min)
                    newsMap[newsNumber] = news_mat
                    newsMap_2[newsNumber] = news_mat.tolist()
                line = article_in.readline()
            else:
                article_in.readline()  # skip the content line of an unknown article
                line = article_in.readline()
    print("newsMap size:" + str(len(newsMap)))
    with open(content_word_w, 'w', encoding='utf-8') as fw:
        json.dump(newsMap_2, fw)
    return newsMap
def get_elements(item_map, element_f, element_root_w):
    """Build (or load from JSON cache) a news-id -> element-embedding matrix map.

    Each record in `element_f` spans 8 "key:value" lines carrying 64-dim
    embeddings keyed by id/time/per/organ/loc/keywords/all. The output rows
    of the (5 x 64) matrix are [time, person, organ, location, keywords];
    empty elements are back-filled with the 'all' embedding, then each row
    is mean/std standardized. If every element is empty the matrix is set
    to all ones. The result is cached as JSON at `element_root_w`.

    Args:
        item_map: dict mapping raw news-id string -> integer news number.
        element_f: path to the element-embedding file (8 lines per record).
        element_root_w: path of the JSON cache to read/write.

    Returns:
        dict mapping int news number -> np.ndarray of shape (5, 64), float32.
    """
    # Cache hit: reload the JSON and convert values back to arrays.
    if os.path.exists(element_root_w):
        with open(element_root_w, 'r') as fr:
            tmp = json.load(fr)
        newsMap = {int(k): np.asarray(v, dtype=np.float32) for k, v in tmp.items()}
        print("newsMap(element) size:" + str(len(newsMap)))
        return newsMap

    newsMap = {}    # news id (int) -> (5 x 64) float32 matrix
    newsMap_2 = {}  # same content as nested lists, for JSON serialization
    with open(element_f, 'r', encoding='utf-8') as fin:
        element_embed = fin.readlines()
    line_num = len(element_embed)
    i = 0
    while 8 * i < line_num:
        if i % 10000 == 0:
            print(i)  # progress heartbeat
        record = element_embed[8 * i: 8 * (i + 1)]
        # BUG FIX: the original incremented i AFTER an `else: continue` for
        # ids missing from item_map, so such records caused an infinite loop.
        # Incrementing up front makes the loop always advance.
        i += 1
        element_dic = {}
        for ele in record:
            ele = ele.strip().split(':')
            element_dic[ele[0]] = ele[1]
        news_id = element_dic['id']  # renamed from `id` (builtin shadowing)
        if news_id not in item_map:
            continue
        newsNumber = item_map[news_id]
        news_mat = np.zeros((5, 64), dtype=np.float32)
        no_entity_set = set()   # row indices with no embedding of their own
        has_entity = False      # True if any element (or 'all') is non-empty
        for key, row in (('time', 0), ('per', 1), ('organ', 2),
                         ('loc', 3), ('keywords', 4)):
            value = element_dic[key]
            if value != '':
                has_entity = True
                news_mat[row] += list(map(float, value.split(' ')))
            else:
                no_entity_set.add(row)
        all_value = element_dic['all']  # renamed from `all` (builtin shadowing)
        if all_value != '':
            has_entity = True
            all_entity = list(map(float, all_value.split(' ')))
            for row in no_entity_set:
                news_mat[row] += all_entity
        if has_entity:
            # In-place per-row standardization (std of a constant row is 0
            # -> divide warning; preserved from the original behavior).
            for sen in news_mat:
                sen -= np.mean(sen, axis=0)
                sen /= np.std(sen, axis=0)
        else:
            # No embedding at all: fall back to an all-ones matrix.
            news_mat = np.ones((5, 64), dtype=np.float32)
            print('have no entity' + str(news_id))
        newsMap[newsNumber] = news_mat
        newsMap_2[newsNumber] = news_mat.tolist()
    print("newsMap(element) size:" + str(len(newsMap)))
    with open(element_root_w, 'w', encoding='utf-8') as fw:
        json.dump(newsMap_2, fw)
    return newsMap
# Negative-candidate pool size used by ReadingNEWS.__getitem__ for DNS
# (Dynamic Negative Sampling): draw N=50 random candidates per example,
# from which `neg_sample` negatives are later selected.
N = 50
class ReadingNEWS(Dataset):
    """Training dataset.

    Each item yields a user's click history (content + element tensors),
    the positive candidate representations, and N randomly sampled negative
    candidates drawn from the user's candidate pool.
    """

    def __init__(self, args, sequences, targets, targets_time, newsMap=None,
                 elementsMap=None, usercandidate=None, negs=3, senK=20):
        self.args = args
        self.data_hist = sequences.sequences
        self.data_hist_time = sequences.sequences_time
        self.data_cand = targets
        self.data_cand_time = targets_time
        self.data_user = sequences.user_ids
        self.newsMap = newsMap
        self.elementsMap = elementsMap
        self.usercandidate = usercandidate
        self.negs = negs
        self.senK = senK

    def __getitem__(self, index):
        hist_items = self.data_hist[index]       # item ids, length L
        hist_times = self.data_hist_time[index]  # timestamps, length L
        pos_item = self.data_cand[index]         # positive candidate id
        pos_time = self.data_cand_time[index]    # its timestamp
        user = self.data_user[index]

        # --- history: content (L x senK x 64) and element (L x 5 x 64) ---
        hist_content = np.zeros((hist_items.shape[0], self.senK, 64), dtype=np.float32)
        hist_element = np.zeros((hist_items.shape[0], 5, 64), dtype=np.float32)
        for pos, item in enumerate(hist_items):
            hist_content[pos, :, :] += self.newsMap[item]
            hist_element[pos, :, :] += self.elementsMap[item]
        hist_content = torch.from_numpy(hist_content)
        hist_element = torch.from_numpy(hist_element)

        # --- positive candidate: sentence-averaged content + element matrix ---
        cand_content = torch.from_numpy(
            np.mean(self.newsMap[pos_item], axis=0, dtype=np.float32))
        cand_element = torch.from_numpy(self.elementsMap[pos_item])

        # --- N negatives sampled with replacement from the user's pool ---
        neg_content = np.zeros((N, 64), dtype=np.float32)
        neg_element = np.zeros((N, 64), dtype=np.float32)
        neg_ids = np.zeros(N, dtype=np.int64)
        pool = self.usercandidate[user]
        sampled = np.random.choice(a=pool, size=N, replace=True, p=None)
        for pos, item in enumerate(sampled):
            neg_ids[pos] += item
            neg_content[pos] += np.mean(self.newsMap[item], axis=0)
            neg_element[pos] += np.mean(self.elementsMap[item], axis=0)
        neg_content = torch.from_numpy(neg_content)
        neg_element = torch.from_numpy(neg_element)
        neg_ids = torch.from_numpy(neg_ids)

        return (hist_content, hist_element, hist_items, cand_content,
                cand_element, pos_item, user, hist_times, pos_time,
                neg_ids, neg_content, neg_element)

    def __len__(self):
        return len(self.data_hist)
class ReadingNEWSTest(Dataset):
    """Evaluation dataset.

    Same layout as ReadingNEWS, except only `negs` negatives are drawn per
    item and the negative element representation keeps the full (5 x 64)
    matrix instead of averaging it.
    """

    def __init__(self, args, sequences, targets, targets_time, newsMap=None,
                 elementsMap=None, usercandidate=None, negs=3, senK=20):
        self.args = args
        self.data_hist = sequences.sequences
        self.data_hist_time = sequences.sequences_time
        self.data_cand = targets
        self.data_cand_time = targets_time
        self.data_user = sequences.user_ids
        self.newsMap = newsMap
        self.elementsMap = elementsMap
        self.usercandidate = usercandidate
        self.negs = negs
        self.senK = senK

    def __getitem__(self, index):
        hist_items = self.data_hist[index]       # item ids, length L
        hist_times = self.data_hist_time[index]  # timestamps, length L
        pos_item = self.data_cand[index]         # positive candidate id
        pos_time = self.data_cand_time[index]    # its timestamp
        user = self.data_user[index]

        # --- history: content (L x senK x 64) and element (L x 5 x 64) ---
        hist_content = np.zeros((hist_items.shape[0], self.senK, 64), dtype=np.float32)
        hist_element = np.zeros((hist_items.shape[0], 5, 64), dtype=np.float32)
        for pos, item in enumerate(hist_items):
            hist_content[pos, :, :] += self.newsMap[item]
            hist_element[pos, :, :] += self.elementsMap[item]
        hist_content = torch.from_numpy(hist_content)
        hist_element = torch.from_numpy(hist_element)

        # --- positive candidate: manual sentence average + element matrix ---
        cand_content = np.zeros(64, dtype=np.float32)
        pos_rows = self.newsMap[pos_item]
        for row in pos_rows:
            cand_content += row
        cand_content = cand_content / len(pos_rows)
        cand_content = torch.from_numpy(cand_content)
        cand_element = torch.from_numpy(self.elementsMap[pos_item])

        # --- `negs` negatives sampled with replacement from the user's pool ---
        neg_content = np.zeros((self.negs, 64), dtype=np.float32)
        neg_element = np.zeros((self.negs, 5, 64), dtype=np.float32)
        neg_ids = np.zeros(self.negs, dtype=np.int64)
        pool = self.usercandidate[user]
        sampled = np.random.choice(a=pool, size=self.negs, replace=True, p=None)
        for pos, item in enumerate(sampled):
            neg_ids[pos] += item
            acc = np.zeros(64, dtype=np.float32)
            for row in self.newsMap[item]:
                acc += row
            neg_content[pos] += acc / len(self.newsMap[item])
            # Keep the full (5 x 64) element matrix for each negative.
            neg_element[pos] += self.elementsMap[item]
        neg_content = torch.from_numpy(neg_content)
        neg_element = torch.from_numpy(neg_element)
        neg_ids = torch.from_numpy(neg_ids)

        return (hist_content, hist_element, hist_items, cand_content,
                cand_element, pos_item, user, hist_times, pos_time,
                neg_ids, neg_content, neg_element)

    def __len__(self):
        return len(self.data_hist)