# -*- coding: utf-8 -*-
"""
Created on Fri Jun 26 11:57:27 2015
@author: Balázs Hidasi
"""

import numpy as np
import pandas as pd

| 11 | +class ItemKNN: |
| 12 | + ''' |
| 13 | + ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time') |
| 14 | + |
| 15 | + Item-to-item predictor that computes the the similarity to all items to the given item. |
| 16 | + |
| 17 | + Similarity of two items is given by: |
| 18 | + |
| 19 | + .. math:: |
| 20 | + s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\\lambda)^{\\alpha}(supp_j+\\lambda)^{1-\\alpha} |
| 21 | + |
| 22 | + Parameters |
| 23 | + -------- |
| 24 | + n_sims : int |
| 25 | + Only give back non-zero scores to the N most similar items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100) |
| 26 | + lmbd : float |
| 27 | + Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20) |
| 28 | + alpha : float |
| 29 | + Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules). |
| 30 | + session_key : string |
| 31 | + header of the session ID column in the input file (default: 'SessionId') |
| 32 | + item_key : string |
| 33 | + header of the item ID column in the input file (default: 'ItemId') |
| 34 | + time_key : string |
| 35 | + header of the timestamp column in the input file (default: 'Time') |
| 36 | + |
| 37 | + ''' |
| 38 | + |
| 39 | + def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'): |
| 40 | + self.n_sims = n_sims |
| 41 | + self.lmbd = lmbd |
| 42 | + self.alpha = alpha |
| 43 | + self.item_key = item_key |
| 44 | + self.session_key = session_key |
| 45 | + self.time_key = time_key |
| 46 | + |
| 47 | + def fit(self, data): |
| 48 | + ''' |
| 49 | + Trains the predictor. |
| 50 | + |
| 51 | + Parameters |
| 52 | + -------- |
| 53 | + data: pandas.DataFrame |
| 54 | + Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). |
| 55 | + It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). |
| 56 | + |
| 57 | + ''' |
| 58 | + data.set_index(np.arange(len(data)), inplace=True) |
| 59 | + self.itemids = data[self.item_key].unique() |
| 60 | + n_items = len(self.itemids) |
| 61 | + data = pd.merge(data, pd.DataFrame({self.item_key: self.itemids, 'ItemIdx': np.arange(len(self.itemids))}), |
| 62 | + on=self.item_key, how='inner') |
| 63 | + sessionids = data[self.session_key].unique() |
| 64 | + data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}), |
| 65 | + on=self.session_key, how='inner') |
| 66 | + supp = data.groupby('SessionIdx').size() |
| 67 | + session_offsets = np.zeros(len(supp) + 1, dtype=np.int32) |
| 68 | + session_offsets[1:] = supp.cumsum() |
| 69 | + index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values |
| 70 | + supp = data.groupby('ItemIdx').size() |
| 71 | + item_offsets = np.zeros(n_items + 1, dtype=np.int32) |
| 72 | + item_offsets[1:] = supp.cumsum() |
| 73 | + index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values |
| 74 | + self.sims = dict() |
| 75 | + for i in range(n_items): |
| 76 | + iarray = np.zeros(n_items) |
| 77 | + start = item_offsets[i] |
| 78 | + end = item_offsets[i + 1] |
| 79 | + for e in index_by_items[start:end]: |
| 80 | + uidx = data.SessionIdx.values[e] |
| 81 | + ustart = session_offsets[uidx] |
| 82 | + uend = session_offsets[uidx + 1] |
| 83 | + user_events = index_by_sessions[ustart:uend] |
| 84 | + iarray[data.ItemIdx.values[user_events]] += 1 |
| 85 | + iarray[i] = 0 |
| 86 | + norm = np.power((supp[i] + self.lmbd), self.alpha) * np.power((supp.values + self.lmbd), (1.0 - self.alpha)) |
| 87 | + norm[norm == 0] = 1 |
| 88 | + iarray = iarray / norm |
| 89 | + indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1] |
| 90 | + self.sims[self.itemids[i]] = pd.Series(data=iarray[indices], index=self.itemids[indices]) |
| 91 | + |
| 92 | + def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0): |
| 93 | + ''' |
| 94 | + Gives predicton scores for a selected set of items on how likely they be the next item in the session. |
| 95 | + |
| 96 | + Parameters |
| 97 | + -------- |
| 98 | + session_id : int or string |
| 99 | + The session IDs of the event. |
| 100 | + input_item_id : int or string |
| 101 | + The item ID of the event. Must be in the set of item IDs of the training set. |
| 102 | + predict_for_item_ids : 1D array |
| 103 | + IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set. |
| 104 | + |
| 105 | + Returns |
| 106 | + -------- |
| 107 | + out : pandas.Series |
| 108 | + Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs. |
| 109 | + |
| 110 | + ''' |
| 111 | + if predict_for_item_ids is None: |
| 112 | + predict_for_item_ids = self.itemids |
| 113 | + preds = np.zeros(len(predict_for_item_ids)) |
| 114 | + sim_list = self.sims[input_item_id] |
| 115 | + mask = np.in1d(predict_for_item_ids, sim_list.index) |
| 116 | + preds[mask] = sim_list[predict_for_item_ids[mask]] |
| 117 | + return pd.Series(data=preds, index=predict_for_item_ids) |