Merged
22 commits
- 6ae5ef9 BUG: groupby(..., dropna=False) drops null values with categorical gr… (rhshadrach, Oct 5, 2022)
- 20e17ab Use intp (rhshadrach, Nov 12, 2022)
- 29ee263 Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 12, 2022)
- 8254619 Fixups (rhshadrach, Nov 13, 2022)
- f679fd7 Use intp (rhshadrach, Nov 13, 2022)
- 9a3fb30 Merge branch 'main' into groupby_dropna_filtering (rhshadrach, Nov 15, 2022)
- 34760ca int64 (rhshadrach, Nov 16, 2022)
- 93f306c dtype fix (rhshadrach, Nov 19, 2022)
- d6100b4 Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 19, 2022)
- f3a3ebb Breakup op to debug on CI (rhshadrach, Nov 20, 2022)
- 4bfeaa1 Trying with intp (rhshadrach, Nov 20, 2022)
- af9d90c Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Nov 20, 2022)
- 45f3947 Merge branch 'main' into groupby_dropna_filtering (rhshadrach, Nov 28, 2022)
- 4d72402 Restore cache decorator (rhshadrach, Nov 29, 2022)
- 6f2f51d Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 29, 2022)
- 9ddc2d0 Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Nov 29, 2022)
- 1e3bff3 Add bincount comment (rhshadrach, Nov 29, 2022)
- 5a42eeb Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 29, 2022)
- c8ba7ad Rework recoding logic (rhshadrach, Nov 30, 2022)
- a548506 Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Nov 30, 2022)
- 9bec396 Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Dec 2, 2022)
- 867e97a Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Dec 2, 2022)
doc/source/whatsnew/v2.0.0.rst (1 addition, 0 deletions)
@@ -751,6 +751,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.apply` and :meth:`.SeriesGroupBy.apply` with ``as_index=False`` would not attempt the computation without using the grouping keys when using them failed with a ``TypeError`` (:issue:`49256`)
 - Bug in :meth:`.DataFrameGroupBy.describe` would describe the group keys (:issue:`49256`)
 - Bug in :meth:`.SeriesGroupBy.describe` with ``as_index=False`` would have the incorrect shape (:issue:`49256`)
+- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`)

 Reshaping
 ^^^^^^^^^
pandas/core/algorithms.py (23 additions, 0 deletions)
@@ -406,6 +406,29 @@ def unique(values):
     return unique_with_mask(values)


+def nunique_ints(values: ArrayLike) -> int:
+    """
+    Return the number of unique values for integer array-likes.
+
+    Significantly faster than pandas.unique for long enough sequences.
+    No checks are done to ensure input is integral.
+
+    Parameters
+    ----------
+    values : 1d array-like
+
+    Returns
+    -------
+    int : The number of unique values in ``values``
+    """
+    if len(values) == 0:
+        return 0
+    values = _ensure_data(values)
+    # bincount requires intp
+    result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
+    return result
+
+
 def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
     """See algorithms.unique for docs. Takes a mask for masked arrays."""
     values = _ensure_arraylike(values)
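The bincount trick in `nunique_ints` is worth seeing in isolation. A minimal sketch of the same idea in plain NumPy (the toy `codes` array is invented for illustration): `np.bincount(a)[k]` counts occurrences of `k`, so counting the nonzero bins counts the distinct values. Like `nunique_ints`, this assumes non-negative integers.

```python
import numpy as np

codes = np.array([3, 0, 3, 2, 0, 2], dtype=np.intp)

# bincount(codes)[k] == number of occurrences of k, so the number of
# nonzero bins equals the number of distinct values present.
n_unique = int((np.bincount(codes) != 0).sum())

assert n_unique == len(np.unique(codes))  # both give 3
```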
pandas/core/groupby/groupby.py (17 additions, 8 deletions)
@@ -42,6 +42,7 @@ class providing the base-class of operations.
     Timestamp,
     lib,
 )
+from pandas._libs.algos import rank_1d
 import pandas._libs.groupby as libgroupby
 from pandas._typing import (
     AnyArrayLike,
@@ -2268,12 +2269,15 @@ def size(self) -> DataFrame | Series:
         else:
             result = self._obj_1d_constructor(result)

+        with com.temp_setattr(self, "as_index", True):
+            # size already has the desired behavior in GH#49519, but this makes the
+            # as_index=False path of _reindex_output fail on categorical groupers.
+            result = self._reindex_output(result, fill_value=0)
         if not self.as_index:
             # error: Incompatible types in assignment (expression has
             # type "DataFrame", variable has type "Series")
             result = result.rename("size").reset_index()  # type: ignore[assignment]

-        return self._reindex_output(result, fill_value=0)
+        return result

     @final
     @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
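The `temp_setattr` context manager briefly forces `as_index=True` so that `_reindex_output` always takes the Series-shaped path, and the `as_index=False` conversion happens afterwards. A hedged sketch of the user-facing behavior this preserves (the toy frame and column names are illustrative, not from the PR's test suite):

```python
import pandas as pd

df = pd.DataFrame({"key": pd.Categorical(["a", None, "a"]), "val": [1, 2, 3]})

# With dropna=False the NA group is kept, and as_index=False returns the
# group sizes as a regular column instead of an index level.
sizes = df.groupby("key", dropna=False, as_index=False, observed=False).size()
# Expected (on a build including this change): one row for "a" with size 2
# and one row for the NA group with size 1.
```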
@@ -3269,6 +3273,10 @@ def ngroup(self, ascending: bool = True):
         else:
             dtype = np.int64

+        if any(ping._passed_categorical for ping in self.grouper.groupings):
+            # comp_ids reflect non-observed groups, we need only observed
+            comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
+
         result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
         if not ascending:
             result = self.ngroups - 1 - result
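`rank_1d(..., ties_method="dense") - 1` relabels the group ids so that only observed groups count, closing the gaps left by unobserved categories. For integer ids like these, it should be equivalent to `np.unique` with `return_inverse` (sketch with an invented `comp_ids`):

```python
import numpy as np

# Ids 1, 3 and 4 never occur, e.g. because those categories are unobserved.
comp_ids = np.array([0, 2, 2, 5, 0])

# Dense rank minus one: observed groups are renumbered 0..n-1 in order.
_, dense = np.unique(comp_ids, return_inverse=True)
print(dense)  # [0 1 1 2 0]
```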
Expand Down Expand Up @@ -3950,7 +3958,7 @@ def _reindex_output(
names = names + [None]
index = MultiIndex.from_product(levels_list, names=names)
if self.sort:
index = index.sortlevel()[0]
index = index.sort_values()

if self.as_index:
# Always holds for SeriesGroupBy unless GH#36507 is implemented
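One plausible reading of the switch from `sortlevel()[0]` to `sort_values()` is that the reindex target can now contain an NA group, and `Index.sort_values` places missing values last by default (`na_position="last"`). A small sketch of that default:

```python
import numpy as np
import pandas as pd

idx = pd.Index(["b", np.nan, "a"])
print(idx.sort_values())  # Index(['a', 'b', nan], dtype='object')
```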
@@ -3972,12 +3980,12 @@
         # reindex `output`, and then reset the in-axis grouper columns.

         # Select in-axis groupers
-        in_axis_grps = (
+        in_axis_grps = list(
             (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
         )
-        g_nums, g_names = zip(*in_axis_grps)
-
-        output = output.drop(labels=list(g_names), axis=1)
+        if len(in_axis_grps) > 0:
+            g_nums, g_names = zip(*in_axis_grps)
+            output = output.drop(labels=list(g_names), axis=1)

         # Set a temp index and reindex (possibly expanding)
         output = output.set_index(self.grouper.result_index).reindex(
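Materializing `in_axis_grps` as a list and guarding on its length avoids unpacking an empty `zip`, which raises. Minimal reproduction (grouping only by arrays that are not columns of the frame, for example, leaves no in-axis groupers):

```python
in_axis_grps = []  # no grouper is a column of the frame

try:
    g_nums, g_names = zip(*in_axis_grps)
except ValueError as err:
    print(err)  # not enough values to unpack (expected 2, got 0)
```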
@@ -3986,7 +3994,8 @@

         # Reset in-axis grouper columns
         # (using level numbers `g_nums` because level names may not be unique)
-        output = output.reset_index(level=g_nums)
+        if len(in_axis_grps) > 0:
+            output = output.reset_index(level=g_nums)

         return output.reset_index(drop=True)

pandas/core/groupby/grouper.py (43 additions, 4 deletions)
@@ -612,6 +612,9 @@ def group_arraylike(self) -> ArrayLike:
             # retain dtype for categories, including unobserved ones
             return self.result_index._values

+        elif self._passed_categorical:
+            return self.group_index._values
+
         return self._codes_and_uniques[1]

     @cache_readonly
@@ -621,14 +624,31 @@ def result_index(self) -> Index:
         if self._all_grouper is not None:
             group_idx = self.group_index
             assert isinstance(group_idx, CategoricalIndex)
-            categories = self._all_grouper.categories
+            cats = self._orig_cats
             # set_categories is dynamically added
-            return group_idx.set_categories(categories)  # type: ignore[attr-defined]
+            return group_idx.set_categories(cats)  # type: ignore[attr-defined]
         return self.group_index

     @cache_readonly
     def group_index(self) -> Index:
-        uniques = self._codes_and_uniques[1]
+        codes, uniques = self._codes_and_uniques
+        if not self._dropna and self._passed_categorical:
+            assert isinstance(uniques, Categorical)
+            if self._sort and (codes == len(uniques)).any():
+                # Add NA value on the end when sorting
+                uniques = Categorical.from_codes(
+                    np.append(uniques.codes, [-1]), uniques.categories
+                )
+            else:
+                # Need to determine proper placement of NA value when not sorting
+                cat = self.grouping_vector
+                na_idx = (cat.codes < 0).argmax()
+                if cat.codes[na_idx] < 0:
+                    # count number of unique codes that comes before the nan value
+                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
+                    uniques = Categorical.from_codes(
+                        np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
+                    )
         return Index._with_infer(uniques, name=self.name)

     @cache_readonly
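`Categorical.from_codes` treats `-1` as NA, which is how `group_index` materializes the NA group: appended at the end when sorting, or spliced in at the position of first appearance otherwise. A sketch (the toy categorical is invented):

```python
import numpy as np
import pandas as pd

uniques = pd.Categorical.from_codes([0, 1], categories=["a", "b"])

# Sorting path: the NA group goes on the end.
na_last = pd.Categorical.from_codes(
    np.append(uniques.codes, [-1]), uniques.categories
)
print(list(na_last))  # ['a', 'b', nan]

# No-sort path: insert -1 where the NA value first appeared, say position 1.
na_mid = pd.Categorical.from_codes(
    np.insert(uniques.codes, 1, -1), uniques.categories
)
print(list(na_mid))  # ['a', nan, 'b']
```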
@@ -651,9 +671,28 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             uniques = Categorical.from_codes(
                 codes=ucodes, categories=categories, ordered=cat.ordered
             )
+
+            codes = cat.codes
+            if not self._dropna:
+                na_mask = codes < 0
+                if np.any(na_mask):
+                    if self._sort:
+                        # Replace NA codes with `largest code + 1`
+                        na_code = len(categories)
+                        codes = np.where(na_mask, na_code, codes)
+                    else:
+                        # Insert NA code into the codes based on first appearance
+                        # A negative code must exist, no need to check codes[na_idx] < 0
+                        na_idx = na_mask.argmax()
+                        # count number of unique codes that comes before the nan value
+                        na_code = algorithms.nunique_ints(codes[:na_idx])
+                        codes = np.where(codes >= na_code, codes + 1, codes)
+                        codes = np.where(na_mask, na_code, codes)
+
             if not self._observed:
                 uniques = uniques.reorder_categories(self._orig_cats)
-            return cat.codes, uniques
+
+            return codes, uniques

         elif isinstance(self.grouping_vector, ops.BaseGrouper):
             # we have a list of groupers
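The no-sort branch of the recoding can be traced with plain NumPy (`np.unique` stands in for `algorithms.nunique_ints` here, and the codes are toy data). Given codes where `-1` marks NA, the NA group gets the code equal to the number of distinct codes seen before its first appearance, and every existing code at or above that value shifts up by one:

```python
import numpy as np

codes = np.array([1, 0, -1, 1, -1])  # -1 marks NA; first NA at position 2
na_mask = codes < 0

na_idx = na_mask.argmax()                  # 2
na_code = len(np.unique(codes[:na_idx]))   # 2 distinct codes precede the NA

# Make room for the NA code, then fill it in.
codes = np.where(codes >= na_code, codes + 1, codes)
codes = np.where(na_mask, na_code, codes)
print(codes)  # [1 0 2 1 2]
```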
pandas/tests/groupby/test_categorical.py (2 additions, 1 deletion)
@@ -831,6 +831,7 @@ def test_preserve_categories():
     df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
     sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
     # GH#48749 - don't change order of categories
+    # GH#42482 - don't sort result when sort=False, even when ordered=True
     nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
     tm.assert_index_equal(
         df.groupby("A", sort=True, observed=False).first().index, sort_index
@@ -1218,7 +1219,7 @@ def test_seriesgroupby_observed_true(df_cat, operation):
     lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
     lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
     index = MultiIndex.from_arrays([lev_a, lev_b])
-    expected = Series(data=[2, 4, 1, 3], index=index, name="C")
+    expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()

     grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
     result = getattr(grouped, operation)(sum)
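Putting it together, the user-visible bug this PR fixes (GH#36327) is that NA values vanished from results even with `dropna=False` whenever the grouper was categorical. A hedged end-to-end check (toy data; the expected output assumes a build that includes this change):

```python
import pandas as pd

df = pd.DataFrame(
    {"key": pd.Categorical(["a", "b", None, "a"]), "val": [1, 2, 3, 4]}
)

result = df.groupby("key", dropna=False, observed=False)["val"].sum()
# Before this change the NA row was silently dropped; with it, the expected
# result has index ['a', 'b', NaN] with sums [5, 2, 3].
```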