Skip to content

Commit 72dea8a

Browse files
author
Massimo Quadrana
committed
add knn models and notebook
1 parent ccc54d4 commit 72dea8a

File tree

8 files changed

+3039
-0
lines changed

8 files changed

+3039
-0
lines changed

07_KNN.ipynb

Lines changed: 627 additions & 0 deletions
Large diffs are not rendered by default.

recommenders/KNNRecommender.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from recommenders.ISeqRecommender import ISeqRecommender
2+
from util.data_utils import dataset_to_gru4rec_format
3+
from util.knn.iknn import ItemKNN
4+
from util.knn.sknn import SessionKNN
5+
from util.knn.vmsknn import VMSessionKNN
6+
from util.knn.ssknn import SeqSessionKNN
7+
from util.knn.sfsknn import SeqFilterSessionKNN
8+
9+
10+
class KNNRecommender(ISeqRecommender):
    """
    Interface to ItemKNN and Session-based KNN methods. Based on:

    Evaluation of Session-based Recommendation Algorithms, Malte Ludewig and Dietmar Jannach
    """
    # Maps the public model name to the class implementing it.
    knn_models = {
        'iknn': ItemKNN,
        'sknn': SessionKNN,
        'v-sknn': VMSessionKNN,
        's-sknn': SeqSessionKNN,
        'sf-sknn': SeqFilterSessionKNN
    }

    def __init__(self,
                 model='cknn',
                 **init_args):
        """
        :param model: One among the following KNN models:
            - iknn: ItemKNN, item-to-item KNN based on the *last* item in the session to determine the items to be recommended.
            - sknn: SessionKNN, compares the *entire* current session with the past sessions in the training data to
                    determine the items to be recommended.
            - v-sknn: VMSessionKNN, use linearly decayed real-valued vectors to encode the current session,
                    then compares the current session with the past sessions in the training data using the dot-product
                    to determine the items to be recommended.
            - s-sknn: SeqSessionKNN, this variant also puts more weight on elements that appear later in the session by
                    using a custom scoring function (see the paper by Ludewig and Jannach).
            - sf-sknn: SeqFilterSessionKNN, this variant also puts more weight on elements that appear later in the session
                    in a more restrictive way by using a custom scoring function (see the paper by Ludewig and Jannach).

            NOTE(review): the default value 'cknn' is not a key of `knn_models`, so a ValueError is raised unless
            the caller passes one of the names listed above explicitly.

        :param init_args: The model initialization arguments. See the following initializations or
            check `util.knn` for more details on each model:
            - iknn: ItemKNN(n_sims=100, lmbd=20, alpha=0.5)
            - sknn: SessionKNN(k, sample_size=500, sampling='recent', similarity='jaccard', remind=False, pop_boost=0)
            - v-sknn: VMSessionKNN(k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div',
                     dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score',
                     weighting_time=False, normalize=True)
            - s-sknn: SeqSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div',
                    remind=False, pop_boost=0, extend=False, normalize=True)
            - sf-sknn: SeqFilterSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0,
                    extend=False, normalize=True)

            The keys `session_key`, `item_key` and `time_key` are always overwritten with
            'session_id', 'item_id' and 'ts' respectively.

        :raises ValueError: if `model` is not one of the keys of `knn_models`.
        """
        # The two-argument form binds the proxy to `self`, so the parent
        # initializer actually runs on this instance.
        super(KNNRecommender, self).__init__()
        if model not in self.knn_models:
            raise ValueError("Unknown KNN model '{}'. The available ones are: {}".format(
                model, list(self.knn_models.keys())
            ))
        self.init_args = init_args
        # Force the column names passed to the wrapped model.
        self.init_args.update(dict(session_key='session_id',
                                   item_key='item_id',
                                   time_key='ts'))
        self.model = self.knn_models[model](**self.init_args)
        self.pseudo_session_id = 0

    def __str__(self):
        """Return the string representation of the wrapped KNN model."""
        return str(self.model)

    def fit(self, train_data):
        """
        Convert the training data with `dataset_to_gru4rec_format` and fit the wrapped model on it.

        :param train_data: the training dataset, in the format accepted by `dataset_to_gru4rec_format`.
        """
        self.logger.info('Converting training data to GRU4Rec format')
        # parse training data to GRU4Rec format
        train_data = dataset_to_gru4rec_format(dataset=train_data)

        self.logger.info('Training started')
        self.model.fit(train_data)
        self.logger.info('Training completed')
        # restart the pseudo-session counter after every (re)fit
        self.pseudo_session_id = 0

    def recommend(self, user_profile, user_id=None):
        """
        Feed the items of `user_profile` one by one to the wrapped model and return the
        scores produced after the last item.

        :param user_profile: iterable of item ids forming the current session.
        :param user_id: not used by this recommender.
        :return: list of `([item_id], score)` tuples sorted by decreasing score;
            an empty list if `user_profile` is empty.
        """
        pred = None
        for item in user_profile:
            pred = self.model.predict_next(session_id=self.pseudo_session_id,
                                           input_item_id=item)
        if pred is None:
            # empty profile: the model was never queried, nothing to recommend
            return []
        # sort items by predicted score (keyword-only call, works on every pandas version)
        pred = pred.sort_values(ascending=False)
        # increase the pseudo-session id so that future calls to recommend() won't be connected
        self.pseudo_session_id += 1
        # convert to the required output format
        return [([item_id], score) for item_id, score in pred.items()]

util/knn/__init__.py

Whitespace-only changes.

util/knn/iknn.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Fri Jun 26 11:57:27 2015
4+
@author: Balázs Hidasi
5+
"""
6+
7+
import numpy as np
8+
import pandas as pd
9+
10+
11+
class ItemKNN:
    r'''
    ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time')

    Item-to-item predictor that computes the similarity of all items to the given item.

    Similarity of two items is given by:

    .. math::
        s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\lambda)^{\alpha}(supp_j+\lambda)^{1-\alpha}

    Parameters
    --------
    n_sims : int
        Only give back non-zero scores to the N most similar items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100)
    lmbd : float
        Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20)
    alpha : float
        Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules).
    session_key : string
        header of the session ID column in the input file (default: 'SessionId')
    item_key : string
        header of the item ID column in the input file (default: 'ItemId')
    time_key : string
        header of the timestamp column in the input file (default: 'Time')

    '''

    def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'):
        self.n_sims = n_sims
        self.lmbd = lmbd
        self.alpha = alpha
        self.item_key = item_key
        self.session_key = session_key
        self.time_key = time_key

    def fit(self, data):
        '''
        Trains the predictor.

        Fills `self.itemids` with the distinct item IDs and `self.sims` with one
        pandas.Series per item holding the scores of its `n_sims` most similar items.

        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
            The frame is not modified.

        '''
        # Work on a re-indexed copy so that the caller's frame keeps its own index.
        data = data.reset_index(drop=True)
        self.itemids = data[self.item_key].unique()
        n_items = len(self.itemids)
        # Attach dense 0..n-1 indices for items and sessions.
        item_map = pd.DataFrame({self.item_key: self.itemids, 'ItemIdx': np.arange(n_items)})
        data = pd.merge(data, item_map, on=self.item_key, how='inner')
        sessionids = data[self.session_key].unique()
        session_map = pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))})
        data = pd.merge(data, session_map, on=self.session_key, how='inner')

        # Row positions grouped by session, with the offsets delimiting every session.
        session_sizes = data.groupby('SessionIdx').size()
        session_offsets = np.zeros(len(session_sizes) + 1, dtype=np.int64)
        session_offsets[1:] = session_sizes.cumsum().values
        index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values

        # Row positions grouped by item, with the offsets delimiting every item.
        supp = data.groupby('ItemIdx').size()
        item_offsets = np.zeros(n_items + 1, dtype=np.int64)
        item_offsets[1:] = supp.cumsum().values
        index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values

        # Loop invariants, computed once instead of once per event / per item.
        session_idx_of_row = data['SessionIdx'].values
        item_idx_of_row = data['ItemIdx'].values
        reg_support = supp.values + self.lmbd
        norm_other = np.power(reg_support, (1.0 - self.alpha))

        self.sims = dict()
        for i in range(n_items):
            iarray = np.zeros(n_items)
            for e in index_by_items[item_offsets[i]:item_offsets[i + 1]]:
                uidx = session_idx_of_row[e]
                user_events = index_by_sessions[session_offsets[uidx]:session_offsets[uidx + 1]]
                # co-occurrence count of item i with the items of this session
                iarray[item_idx_of_row[user_events]] += 1
            # an item is never similar to itself
            iarray[i] = 0
            norm = np.power(reg_support[i], self.alpha) * norm_other
            # avoid division by zero
            norm[norm == 0] = 1
            iarray = iarray / norm
            # positions of the n_sims largest scores, in decreasing order
            indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1]
            self.sims[self.itemids[i]] = pd.Series(data=iarray[indices], index=self.itemids[indices])

    def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
        '''
        Gives prediction scores for a selected set of items on how likely they be the next item in the session.

        Parameters
        --------
        session_id : int or string
            The session IDs of the event. Not used by this predictor.
        input_item_id : int or string
            The item ID of the event. Must be in the set of item IDs of the training set.
        predict_for_item_ids : 1D array or list
            IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
            If None, all the items of the training set are scored.
        skip, type, timestamp :
            Not used by this predictor.

        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.

        Raises
        --------
        KeyError
            If `input_item_id` was not in the training set.

        '''
        if predict_for_item_ids is None:
            predict_for_item_ids = self.itemids
        # accept any sequence, the boolean mask below requires an array
        predict_for_item_ids = np.asarray(predict_for_item_ids)
        preds = np.zeros(len(predict_for_item_ids))
        sim_list = self.sims[input_item_id]
        # items outside the stored neighbourhood keep a score of zero
        mask = np.isin(predict_for_item_ids, sim_list.index)
        preds[mask] = sim_list.loc[predict_for_item_ids[mask]].values
        return pd.Series(data=preds, index=predict_for_item_ids)

0 commit comments

Comments
 (0)