recPipeline.py
"""
author: Colin Bradley
last updated: 03/02/2021
This script is the LightFM version of the recommender system. It is deprecated.
"""
#standard stuff
import numpy as np
import pandas as pd
# model imports
import random
from scipy.sparse import coo_matrix as cm
import lightfm as lf
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
# table formatting
from IPython.display import display, HTML
# for progress bars
from tqdm import tqdm
class LFpipeline():
def __init__(self):
self.books = pd.DataFrame()
self.ratings = pd.DataFrame()
self.book_tags = pd.DataFrame()
self.tags = pd.DataFrame()
self.to_read = pd.DataFrame()
def preprocess(self):
'''
This function is just a clean way to call all preprocessing steps. These steps
include reading in the goodbooks-10k data, fixing some book id's and more.
input:
self - just class object
output:
null - no need to return anything
'''
self.read_data()
self.fix_ids()
def read_data(self):
'''
This function just reads in the goodbooks-10k data. Later should have the
functionality to read in scraped data.
input:
self - class object
output:
null - no need to return anything
'''
self.books = pd.read_csv("goodbooks-10k/books.csv")
self.ratings = pd.read_csv("goodbooks-10k/ratings.csv")
# may not use these at first.
self.book_tags = pd.read_csv("goodbooks-10k/book_tags.csv")
self.tags = pd.read_csv("goodbooks-10k/tags.csv")
self.to_read = pd.read_csv("goodbooks-10k/to_read.csv")
def fix_ids(self):
'''
        This function sets the book and user IDs to start at zero. It also renames the
        columns from user_id and book_id to uid and iid.
input:
self - class object
output:
null - no need to return anything
'''
# just changing to standard feature names
self.ratings.rename(
columns={'user_id': 'uid', 'book_id': 'iid'}, inplace=True)
self.books.rename(columns={'book_id': 'iid'}, inplace=True)
self.to_read.rename(
columns={'user_id': 'uid', 'book_id': 'iid'}, inplace=True)
# starting user and book indices from 0
self.ratings.uid = self.ratings.uid - 1
self.ratings.iid = self.ratings.iid - 1
self.books.iid = self.books.iid - 1
self.to_read.iid = self.to_read.iid - 1
        # map goodreads_book_id to our zero-based iid so book_tags uses the same key as books
temp_books = self.books.set_index('goodreads_book_id')
idMAP = temp_books['iid'].to_dict()
self.book_tags['iid'] = self.book_tags.goodreads_book_id.map(idMAP)
self.book_tags.drop('goodreads_book_id', inplace=True,axis = 1)
def get_interactions(self, ratingsThresh = 0):
'''
        This function builds the full user-item interactions matrix as a scipy sparse coo_matrix.
        Parameters
        ----------
        ratingsThresh : int, optional
            Only include ratings greater than or equal to this value. The default is 0.
        Returns
        -------
        ratSparse : scipy.sparse.coo_matrix
            Sparse user-item ratings matrix of shape (numUsers, numBooks).
'''
# only look at ratings above ratingsThresh
self.ratings = self.ratings.query('rating >= @ratingsThresh')
numUsers = self.ratings.uid.max()+1
numBooks = self.ratings.iid.max()+1
ratSparse = cm((self.ratings.rating, (self.ratings.uid, self.ratings.iid)),
shape=(numUsers, numBooks))
return ratSparse
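    # A minimal usage sketch (assumes the goodbooks-10k CSVs are available in a
    # local "goodbooks-10k/" directory; the threshold value is illustrative):
    #   pipe = LFpipeline()
    #   pipe.preprocess()
    #   interactions = pipe.get_interactions(ratingsThresh = 3)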
def split_test_train(self,
testTrainFrac = 0.5,
ratingsWithheldFrac = 0.4,
testValFrac = 0.5,
ratingsThresh = 0):
'''
This function takes in the entire ratings dataset and splits the interaction matrices into a training, testing and validation set.
First the function splits off a portion of reviews that are then split up into test and validation sets. This function is different
from scikit-learn in that it preserves all user-ids in the training set to avoid a cold start problem when testing your algorithm.
Parameters
----------
        testTrainFrac : float, optional
What percentage of the users would you like to test/validate on. The default is 0.5.
ratingsWithheldFrac : float, optional
What percentage of the ratings of the test/val users would you like to withhold. The default is 0.4.
testValFrac : float, optional
Fraction to split test into test/validation. The default is 0.5.
        ratingsThresh : int, optional
            Only include ratings greater than or equal to this value. The default is 0.
Returns
-------
sparseTrain : scipy coo matrix
A sparse matrix of the training user-item interactions.
sparseTest : scipy coo matrix
A sparse matrix of the testing user-item interactions.
sparseVal : scipy coo matrix
A sparse matrix of the validation user-item interactions.
'''
        # random is already imported at module level; seed it here for a reproducible user split
        random.seed(1001)
# only look at ratings above ratingsThresh
self.ratings = self.ratings.query('rating >= @ratingsThresh')
# let's get the list of users for the test set
uids = self.ratings.uid.unique().tolist()
test_uids = random.sample(uids, k = int(len(uids)*testTrainFrac))
# get all data for test users and put the rest in training
test_users = self.ratings.query('uid == @test_uids')
train = self.ratings.query('uid != @test_uids')
        # withhold ratingsWithheldFrac of each test user's ratings for test/validation
        # and put the rest back in the train set. This prevents the cold start problem
        # on the test set. We will later incorporate new users.
test = test_users.groupby('uid').sample(frac = ratingsWithheldFrac, random_state = 1)
validation = test.groupby('uid').sample(frac = testValFrac, random_state = 888)
train = train.append(test_users.drop(test.index), ignore_index = True)
test.drop(validation.index,inplace = True)
        # get these into sparse matrices; use the full dataset dimensions so the train,
        # test and validation matrices all share the same shape, which LightFM's
        # evaluation functions expect
        numUsers = self.ratings.uid.max()+1
        numBooks = self.ratings.iid.max()+1
        sparseTrain = cm((train.rating, (train.uid, train.iid)),
                         shape=(numUsers, numBooks))
        sparseTest = cm((test.rating, (test.uid, test.iid)),
                        shape=(numUsers, numBooks))
        sparseVal = cm((validation.rating, (validation.uid, validation.iid)),
                       shape=(numUsers, numBooks))
return sparseTrain, sparseTest, sparseVal
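    # A minimal sketch of the split with the default (illustrative) fractions: withhold
    # 40% of the ratings of half the users, then split that held-out portion evenly
    # into test and validation sets:
    #   sparseTrain, sparseTest, sparseVal = pipe.split_test_train(
    #       testTrainFrac = 0.5, ratingsWithheldFrac = 0.4, testValFrac = 0.5)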
# def get_meta(self):
# numUsers = self.ratings.uid.max()+1
# numBooks = self.ratings.iid.max()+1
# numUserFeatures = self.to_read.book_id.max+1
# numItemFeatures = self.tags.book_id.max+1
class LFrecommender():
def __init__(self):
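        # WARP (Weighted Approximate-Rank Pairwise) loss optimizes the rank of
        # positive items, which tends to favour precision@k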
self.model = lf.LightFM(loss = 'warp')
def fit(self,
interactions,
no_components = 10,
learning_schedule = 'adagrad',
loss='warp',
learning_rate = 0.05,
item_alpha = 0.0,
user_alpha = 0.0,
max_sampled = 10,
epochs=20,
num_threads=2):
'''
        This function trains our LightFM model.
Parameters
----------
interactions : coo_sparse matrix
sparse user-items interaction training matrix
rest : see LightFM documentation.
https://making.lyst.com/lightfm/docs/lightfm.html
Returns
-------
None.
'''
# user_meta, item_meta = self.get_meta()
self.model = lf.LightFM(no_components = no_components,
learning_schedule = learning_schedule,
loss=loss,
learning_rate = learning_rate,
item_alpha = item_alpha,
user_alpha = user_alpha,
max_sampled = max_sampled)
self.model.fit(interactions, epochs=epochs, num_threads=num_threads)
def paramSearch(self, train, test, num_samples = 10, num_threads = 2):
'''
This function trains models on various sets of random hyperparameters
and returns a dataframe with scores.
        Parameters
        ----------
        train : coo sparse matrix
            sparse user-item interaction training matrix
        test : coo sparse matrix
            sparse user-item interaction test matrix
        num_samples : int
            How many random hyperparameter samples do you want to draw? The default is 10.
        num_threads : int
            Number of threads to parallelize over. The default is 2.
        Returns
        -------
        pandas DataFrame of sampled hyperparameters with their metric scores.
'''
hyperparams = pd.DataFrame()
print('Searching for optimal parameters...')
for i in tqdm(range(num_samples)):
# get a random sample of hyperparameters
params = sample_hyperparameters()
# train a model using those hyperparameters
self.fit(train, **params, num_threads = num_threads)
            prec, rec, auc = self.score(test = test, train = train, num_threads = num_threads)
params['precision_mean'] = prec.mean()
params['recall_mean'] = rec.mean()
params['auc_mean'] = auc.mean()
            hyperparams = hyperparams.append(params, ignore_index=True)
        return hyperparams.sort_values(by = 'precision_mean')
    def score(self, test, train, num_threads = 2):
        '''
        This function computes precision@k, recall@k and AUC for the fitted model on the
        test interactions, excluding interactions present in the train set. Each metric
        is returned as a per-user numpy array.
        '''
prec = precision_at_k(self.model,
test_interactions = test,
train_interactions = train,
num_threads=num_threads)
rec = recall_at_k(self.model,
test_interactions = test,
train_interactions = train,
num_threads=num_threads)
auc = auc_score(self.model,
test_interactions = test,
train_interactions = train,
num_threads=num_threads)
return prec, rec, auc
def recommend_random(self, ratings, books, seed):
'''
This function will pick a random user out of the list of known users and print out their top
10 recommendations! It will also print their top 10 rated items for comparison.
        inputs:
            ratings - DataFrame of known ratings (uid, iid, rating).
            books - DataFrame of book metadata (iid, title, authors).
            seed - random seed for reproducibility.
outputs:
null - no need to output anything right now.
'''
random.seed(seed)
user = random.choices(ratings.uid.unique().tolist())[0]
# now let's predict them on our trained model
itemList = np.array(ratings.iid.unique().tolist())
itemList.sort()
knownRatings = pd.merge(ratings.query('uid == @user'),
books[['iid', 'title', 'authors']], on='iid', how='left')
score = self.model.predict(user, itemList)
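        # note: indexing books by the argsort positions assumes books is ordered by iid
        # with a default 0..n-1 index (true for goodbooks-10k after fix_ids)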
suggested = books.loc[np.argsort(-score)][['title', 'authors']]
print(color.BOLD + 'User {} known items: '.format(user) + color.END, end='\n')
display(HTML(knownRatings[['title', 'authors', 'rating']]
.sort_values(by='rating', ascending=False).iloc[:10].to_html()))
print(color.BOLD + 'Top 10 suggested items:' + color.END, end='\n')
display(HTML(suggested[:10].to_html()))
class color:
'''
This is for printing purposes.
'''
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
def sample_hyperparameters():
"""
    Return a random draw of hyperparameter choices for LightFM.
"""
return {
"no_components": np.random.randint(5, 64),
"learning_schedule": np.random.choice(["adagrad", "adadelta"]),
"loss": np.random.choice(["bpr", "warp", "warp-kos"]),
"learning_rate": np.random.exponential(0.05),
"item_alpha": np.random.exponential(1e-8),
"user_alpha": np.random.exponential(1e-8),
"max_sampled": np.random.randint(5, 15),
"epochs": np.random.randint(5, 50),
}
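

# A minimal end-to-end usage sketch (assumptions: the goodbooks-10k CSVs sit in a
# local "goodbooks-10k/" directory, and the threshold, epoch and seed values below
# are illustrative, not tuned).
if __name__ == "__main__":
    # build and preprocess the data pipeline
    pipeline = LFpipeline()
    pipeline.preprocess()
    sparseTrain, sparseTest, sparseVal = pipeline.split_test_train(ratingsThresh=3)

    # fit a LightFM model on the training interactions and score it on the test set
    recommender = LFrecommender()
    recommender.fit(sparseTrain, epochs=20, num_threads=2)
    prec, rec, auc = recommender.score(test=sparseTest, train=sparseTrain, num_threads=2)
    print("precision@k: {:.3f}, recall@k: {:.3f}, AUC: {:.3f}".format(
        prec.mean(), rec.mean(), auc.mean()))

    # show known items and top-10 suggestions for a random known user
    recommender.recommend_random(pipeline.ratings, pipeline.books, seed=42)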