# -*- coding: utf-8 -*-
"""
Created on Fri Jun 26 11:57:27 2015
@author: Balázs Hidasi
"""

import numpy as np
import pandas as pd

| 11 | +class ItemKNN: |
| 12 | + ''' |
| 13 | + ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time') |
| 14 | + |
| 15 | + Item-to-item predictor that computes the the similarity to all items to the given item. |
| 16 | + |
| 17 | + Similarity of two items is given by: |
| 18 | + |
| 19 | + .. math:: |
| 20 | + s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\\lambda)^{\\alpha}(supp_j+\\lambda)^{1-\\alpha} |
| 21 | + |
| 22 | + Parameters |
| 23 | + -------- |
| 24 | + n_sims : int |
| 25 | + Only give back non-zero scores to the N most similar items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100) |
| 26 | + lmbd : float |
| 27 | + Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20) |
| 28 | + alpha : float |
| 29 | + Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules). |
| 30 | + session_key : string |
| 31 | + header of the session ID column in the input file (default: 'SessionId') |
| 32 | + item_key : string |
| 33 | + header of the item ID column in the input file (default: 'ItemId') |
| 34 | + time_key : string |
| 35 | + header of the timestamp column in the input file (default: 'Time') |
| 36 | + |
| 37 | + ''' |
| 38 | + |
| 39 | + def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'): |
| 40 | + self.n_sims = n_sims |
| 41 | + self.lmbd = lmbd |
| 42 | + self.alpha = alpha |
| 43 | + self.item_key = item_key |
| 44 | + self.session_key = session_key |
| 45 | + self.time_key = time_key |
| 46 | + |
| 47 | + def fit(self, data): |
| 48 | + ''' |
| 49 | + Trains the predictor. |
| 50 | + |
| 51 | + Parameters |
| 52 | + -------- |
| 53 | + data: pandas.DataFrame |
| 54 | + Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). |
| 55 | + It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). |
| 56 | + |
| 57 | + ''' |
| 58 | + data.set_index(np.arange(len(data)), inplace=True) |
| 59 | + self.itemids = data[self.item_key].unique() |
| 60 | + n_items = len(self.itemids) |
| 61 | + data = pd.merge(data, pd.DataFrame({self.item_key: self.itemids, 'ItemIdx': np.arange(len(self.itemids))}), |
| 62 | + on=self.item_key, how='inner') |
| 63 | + sessionids = data[self.session_key].unique() |
| 64 | + data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}), |
| 65 | + on=self.session_key, how='inner') |
| 66 | + supp = data.groupby('SessionIdx').size() |
| 67 | + session_offsets = np.zeros(len(supp) + 1, dtype=np.int32) |
| 68 | + session_offsets[1:] = supp.cumsum() |
| 69 | + index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values |
| 70 | + supp = data.groupby('ItemIdx').size() |
| 71 | + item_offsets = np.zeros(n_items + 1, dtype=np.int32) |
| 72 | + item_offsets[1:] = supp.cumsum() |
| 73 | + index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values |
| 74 | + self.sims = dict() |
| 75 | + for i in range(n_items): |
| 76 | + iarray = np.zeros(n_items) |
| 77 | + start = item_offsets[i] |
| 78 | + end = item_offsets[i + 1] |
| 79 | + for e in index_by_items[start:end]: |
| 80 | + uidx = data.SessionIdx.values[e] |
| 81 | + ustart = session_offsets[uidx] |
| 82 | + uend = session_offsets[uidx + 1] |
| 83 | + user_events = index_by_sessions[ustart:uend] |
| 84 | + iarray[data.ItemIdx.values[user_events]] += 1 |
| 85 | + iarray[i] = 0 |
| 86 | + norm = np.power((supp[i] + self.lmbd), self.alpha) * np.power((supp.values + self.lmbd), (1.0 - self.alpha)) |
| 87 | + norm[norm == 0] = 1 |
| 88 | + iarray = iarray / norm |
| 89 | + indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1] |
| 90 | + self.sims[self.itemids[i]] = pd.Series(data=iarray[indices], index=self.itemids[indices]) |
| 91 | + |
| 92 | + def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0): |
| 93 | + ''' |
| 94 | + Gives predicton scores for a selected set of items on how likely they be the next item in the session. |
| 95 | + |
| 96 | + Parameters |
| 97 | + -------- |
| 98 | + session_id : int or string |
| 99 | + The session IDs of the event. |
| 100 | + input_item_id : int or string |
| 101 | + The item ID of the event. Must be in the set of item IDs of the training set. |
| 102 | + predict_for_item_ids : 1D array |
| 103 | + IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. |
| 104 | + |
| 105 | + Returns |
| 106 | + -------- |
| 107 | + out : pandas.Series |
| 108 | + Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. |
| 109 | + |
| 110 | + ''' |
| 111 | + if predict_for_item_ids is None: |
| 112 | + predict_for_item_ids = self.itemids |
| 113 | + preds = np.zeros(len(predict_for_item_ids)) |
| 114 | + sim_list = self.sims[input_item_id] |
| 115 | + mask = np.in1d(predict_for_item_ids, sim_list.index) |
| 116 | + preds[mask] = sim_list[predict_for_item_ids[mask]] |
| 117 | + return pd.Series(data=preds, index=predict_for_item_ids) |