-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
fix an indices bug for categorical-datetime columns #26860
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
6f8fdc0 cfa0fef 806f980 27086ba 0d5385f 3820a94 51bda3a b6ba161 f833ece 650a3ec 88630ce c926c06 39f394e 182de89 7b5b370 c700c1a 5543e0d 4ae7db8 abcfaff 85b3b1a File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -305,6 +305,8 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels): | |
| | ||
| def get_indexer_dict(label_list, keys): | ||
| """ return a diction of {labels} -> {indexers} """ | ||
| # address GH 26860 | ||
| keys = [np.asarray(key) for key in keys] | ||
| | Contributor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What are the types on `keys` here? I worry a bit about doing this on a DatetimeIndex with tz. That will emit a warning, since we're changing how we handle datetimes in `np.asarray`. Author There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Honestly, I'm not all that sure what is going into `keys`. | ||
| shape = list(map(len, keys)) | ||
| | ||
| group_index = get_group_index(label_list, shape, sort=True, xnull=True) | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -351,6 +351,98 @@ def f3(x): | |
| df2.groupby("a").apply(f3) | ||
| | ||
| | ||
| def test_groupby_indices_error(): | ||
| # GH 26860 | ||
| # Test if DataFrame Groupby builds gb.indices | ||
| dt = pd.to_datetime(['2018-01-01', '2018-02-01', '2018-03-01']) | ||
| df = pd.DataFrame({ | ||
| 'a': pd.Series(list('abc')), | ||
| 'b': pd.Series(dt, dtype='category'), | ||
| 'c': pd.Categorical.from_codes([-1, 0, 1], categories=[0, 1]) | ||
| }) | ||
| | ||
| df.groupby(['a', 'b']).indices | ||
| | ||
| | ||
| def test_groupby_indices_output(): | ||
| ||
| # GH 26860 | ||
| # Test if DataFrame Groupby builds gb.indices correctly. | ||
| | ||
| cols = [ | ||
| 'int_series', 'int_series_cat', 'float_series', 'float_series_cat', | ||
| 'dt_series', 'dt_series_cat', 'period_series', 'period_series_cat' | ||
| ] | ||
| | ||
| int_series = pd.Series([1, 2, 3]) | ||
| dt_series = pd.to_datetime(['2018Q1', '2018Q2', '2018Q3']) | ||
| df = pd.DataFrame( | ||
| data={ | ||
| 'int_series': int_series, | ||
| 'int_series_cat': int_series.astype('category'), | ||
| 'float_series': int_series.astype('float'), | ||
| 'float_series_cat': int_series.astype('float').astype('category'), | ||
| 'dt_series': dt_series, | ||
| 'dt_series_cat': dt_series.astype('category'), | ||
| 'period_series': dt_series.to_period('Q'), | ||
| 'period_series_cat': dt_series.to_period('Q').astype('category') | ||
| }, | ||
| columns=cols | ||
| ) | ||
| | ||
| from itertools import chain, combinations | ||
| | ||
| gb_cols_it = chain.from_iterable( | ||
| combinations(cols, n + 1) for n in range(len(cols)) | ||
| ) | ||
| for gb_cols in gb_cols_it: | ||
| gb_cols = list(gb_cols) | ||
| num_gb_cols = len(gb_cols) | ||
| | ||
| if num_gb_cols == 1: | ||
| s = df[gb_cols[0]] | ||
| col_vals = list(s.unique()) | ||
| | ||
| if pd.api.types.is_datetime64_any_dtype(s): | ||
| col_vals = list(map(pd.Timestamp, col_vals)) | ||
| | ||
| target = { | ||
| key: np.array([i]) | ||
| for i, key in enumerate(col_vals) | ||
| } | ||
| else: | ||
| col_vals = { | ||
| col: list(df[col].unique()) | ||
| for col in gb_cols | ||
| } | ||
| | ||
| def to_dt(elems): | ||
| elems = map(pd.Timestamp, elems) | ||
| elems = map(lambda dt: dt.to_datetime64(), elems) | ||
| elems = list(elems) | ||
| return elems | ||
alexifm marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| | ||
| for col in gb_cols: | ||
| if pd.api.types.is_datetime64_any_dtype(df[col]): | ||
| col_vals[col] = to_dt(col_vals[col]) | ||
| | ||
| elif pd.api.types.is_categorical_dtype(df[col]): | ||
| if pd.api.types.is_datetime64_any_dtype(df[col].cat.categories): | ||
| col_vals[col] = to_dt(col_vals[col]) | ||
alexifm marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| | ||
| it = zip(*(col_vals[col] for col in gb_cols)) | ||
| target = { | ||
| key: np.array([i]) | ||
| for i, key in enumerate(it) | ||
| } | ||
| | ||
| indices = df.groupby(gb_cols).indices | ||
| | ||
| assert set(target.keys()) == set(indices.keys()) | ||
| for key in target.keys(): | ||
| assert pd.core.dtypes.missing.array_equivalent( | ||
| target[key], indices[key]) | ||
| | ||
| | ||
| def test_attr_wrapper(ts): | ||
| grouped = ts.groupby(lambda x: x.weekday()) | ||
| | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this is a public function. Can you rephrase this to make sense for an end user?
And can you move the release note to the 1.0.0 whatsnew?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make more sense to put it in terms of
`gb.indices`, which was where the problem originally came about?