|
| 1 | +""" |
| 2 | +Interaction with scipy.sparse matrices. |
| 3 | +
|
| 4 | +Currently only includes SparseSeries.to_coo helpers. |
| 5 | +""" |
| 6 | +from pandas.core.frame import DataFrame |
| 7 | +from pandas.core.index import MultiIndex, Index |
| 8 | +from pandas.core.series import Series |
| 9 | +import itertools |
| 10 | +import numpy |
| 11 | +from pandas.compat import OrderedDict |
| 12 | +from pandas.tools.util import cartesian_product |
| 13 | + |
| 14 | + |
def _get_label_to_i_dict(labels, sort_labels=False):
    """Return an OrderedDict mapping each unique label to an integer position.

    Parameters
    ----------
    labels : iterable of iterables
        Every element is converted to a tuple before duplicates are removed.
    sort_labels : bool, default False
        If True, positions are assigned in sorted label order. Otherwise
        they follow the order in which ``Index.unique()`` returns the labels.

    Returns
    -------
    OrderedDict
        ``{label_tuple: position}`` with positions counting up from 0.
    """
    # Build a real list: on Python 3 ``map`` yields a lazy iterator, which the
    # Index constructor is not guaranteed to accept.
    label_tuples = [tuple(label) for label in labels]
    unique_labels = Index(label_tuples).unique().tolist()  # squish
    if sort_labels:
        unique_labels = sorted(unique_labels)
    return OrderedDict(
        (label, position) for position, label in enumerate(unique_labels))
| 22 | + |
| 23 | + |
def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
    """Map the label combinations found in a subset of index levels to positions.

    Parameters
    ----------
    index : object providing ``get_level_values``
        The index whose levels are read.
    subset : iterable
        Level identifiers, each passed to ``index.get_level_values``.
    sort_labels : bool, default False
        Forwarded unchanged to ``_get_label_to_i_dict``.

    Returns
    -------
    Whatever ``_get_label_to_i_dict`` returns for the combined labels.
    """
    per_level_values = [index.get_level_values(level) for level in subset]
    # Transpose the per-level columns into one tuple of labels per entry.
    combined_labels = list(zip(*per_level_values))
    return _get_label_to_i_dict(combined_labels, sort_labels=sort_labels)
| 28 | + |
| 29 | + |
def _check_is_partition(parts, whole):
    """Raise ValueError unless ``parts`` are disjoint and together cover ``whole``.

    Parameters
    ----------
    parts : iterable of iterables
        Candidate pieces; each one is converted to a set.
    whole : iterable
        The elements that the pieces must cover exactly.

    Raises
    ------
    ValueError
        If any element belongs to more than one part, or if the union of
        the parts differs from ``whole``.
    """
    whole = set(whole)
    part_sets = [set(part) for part in parts]
    # ``set().union(*...)`` also works when ``parts`` is empty.
    covered = set().union(*part_sets)
    # Parts are pairwise disjoint exactly when their sizes add up to the size
    # of their union. Checking only the intersection of *all* parts would
    # miss overlaps between some (but not all) of three or more parts.
    if sum(len(part) for part in part_sets) != len(covered):
        raise ValueError(
            'Is not a partition because intersection is not null.')
    if covered != whole:
        raise ValueError('Is not a partition because union is not the whole.')
| 38 | + |
| 39 | + |
def _to_ijv(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """Split a (MultiIndexed) SparseSeries into values, coordinates and labels.

    Returns ``(v, i, j, ilabels, jlabels)`` where ``(v, (i, j))`` is suitable
    for passing to the scipy.sparse.coo constructor.

    Parameters
    ----------
    ss : SparseSeries
        Source series; its index levels supply the row and column labels.
    ilevels, jlevels : sequences of int
        Index level numbers used for rows and columns respectively.
        Together they must partition ``range(ss.index.nlevels)``.
    sort_labels : bool, default False
        Forwarded to ``_get_index_subset_to_coord_dict``.

    Raises
    ------
    ValueError
        Raised by ``_check_is_partition`` when the levels do not partition
        the index levels.
    """
    # index and column levels must be a partition of the index
    _check_is_partition([ilevels, jlevels], range(ss.index.nlevels))

    # from the SparseSeries: get the labels and data for non-null entries
    sparse_array = ss._data.values
    values = sparse_array._valid_sp_values
    block_index = sparse_array.sp_index
    index_entries = ss.index.values
    nonnull_labels = []
    # each (start, length) pair addresses one contiguous run of index entries
    for start, length in zip(block_index.blocs, block_index.blengths):
        nonnull_labels.extend(index_entries[start:(start + length)])

    def get_indexers(levels):
        """ Return sparse coords and dense labels for subset levels """
        keys = [tuple(entry[level] for level in levels)
                for entry in nonnull_labels]
        label_to_coord = _get_index_subset_to_coord_dict(
            ss.index, levels, sort_labels=sort_labels)
        coords = [label_to_coord[key] for key in keys]
        return coords, list(label_to_coord)

    row_coords, row_labels = get_indexers(ilevels)
    col_coords, col_labels = get_indexers(jlevels)

    return values, row_coords, col_coords, row_labels, col_labels
| 65 | + |
| 66 | + |
def _sparse_series_to_coo(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """Convert a SparseSeries to a scipy.sparse.coo_matrix.

    Index levels ``ilevels`` and ``jlevels`` become the row and column labels
    respectively.

    Parameters
    ----------
    ss : SparseSeries
        Series with a unique MultiIndex of at least two levels.
    ilevels, jlevels : sequences of level names or numbers
        Resolved to integer level numbers via ``ss.index._get_level_number``.
    sort_labels : bool, default False
        Forwarded to ``_to_ijv``.

    Returns
    -------
    tuple
        ``(sparse_matrix, row_labels, column_labels)``.

    Raises
    ------
    ValueError
        If the index has fewer than two levels or contains duplicates.
    """
    if ss.index.nlevels < 2:
        # The guard accepts exactly two levels, so the requirement is >= 2.
        raise ValueError('to_coo requires MultiIndex with nlevels >= 2')
    if not ss.index.is_unique:
        raise ValueError(
            'Duplicate index entries are not allowed in to_coo transformation.')

    # to keep things simple, only rely on integer indexing (not labels)
    ilevels = [ss.index._get_level_number(x) for x in ilevels]
    jlevels = [ss.index._get_level_number(x) for x in jlevels]
    ss = ss.copy()
    ss.index.names = [None] * ss.index.nlevels  # kill any existing labels

    v, i, j, il, jl = _to_ijv(
        ss, ilevels=ilevels, jlevels=jlevels, sort_labels=sort_labels)
    # imported here so scipy is only needed when this function is called
    import scipy.sparse
    sparse_matrix = scipy.sparse.coo_matrix(
        (v, (i, j)), shape=(len(il), len(jl)))
    return sparse_matrix, il, jl
| 88 | + |
| 89 | + |
def _coo_to_sparse_series(A, dense_index=False):
    """Convert a scipy.sparse.coo_matrix to a SparseSeries.

    Uses the defaults given in the SparseSeries constructor.

    Parameters
    ----------
    A : scipy.sparse.coo_matrix
        Its ``data``, ``row``, ``col`` and ``shape`` attributes are read.
    dense_index : bool, default False
        If True, the result is reindexed to the full product of
        ``range(A.shape[0])`` and ``range(A.shape[1])``.

    Returns
    -------
    The sorted series converted with ``to_sparse()``, reindexed when
    ``dense_index`` is True.
    """
    coordinates = MultiIndex.from_arrays((A.row, A.col))
    # TODO: specify kind?
    sparse = Series(A.data, coordinates).sort_index().to_sparse()
    if not dense_index:
        return sparse
    # is there a better constructor method to use here?
    row_positions = range(A.shape[0])
    col_positions = range(A.shape[1])
    full_index = MultiIndex.from_product([row_positions, col_positions])
    return sparse.reindex_axis(full_index)
0 commit comments