7 changes: 3 additions & 4 deletions pandas/core/apply.py
@@ -14,7 +14,7 @@
     is_list_like,
     is_sequence,
 )
-from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries
+from pandas.core.dtypes.generic import ABCSeries
 
 from pandas.core.construction import create_series_with_explicit_dtype
 
@@ -278,9 +278,8 @@ def apply_standard(self):
         if (
             self.result_type in ["reduce", None]
             and not self.dtypes.apply(is_extension_array_dtype).any()
-            # Disallow complex_internals since libreduction shortcut
-            # cannot handle MultiIndex
-            and not isinstance(self.agg_axis, ABCMultiIndex)
+            # Disallow complex_internals since libreduction shortcut raises a TypeError
+            and not self.agg_axis._has_complex_internals
         ):
 
             values = self.values
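With a default axis=0 reducing apply, the aggregation axis checked above is the frame's column index, so the guard now skips the libreduction shortcut for CategoricalIndex, IntervalIndex, PeriodIndex, or MultiIndex columns rather than MultiIndex alone. A minimal sketch of the kind of case this covers (assumed behavior on this branch; the example is not part of the PR):

    import numpy as np
    import pandas as pd

    # Columns backed by a CategoricalIndex are not plain ndarray-backed, so the
    # reducing apply below should take the pure-Python path instead of the
    # libreduction shortcut, and still return a Series indexed by the columns.
    df = pd.DataFrame(
        np.arange(9).reshape(3, 3),
        columns=pd.CategoricalIndex(list("abc")),
    )
    print(df.apply(np.sum))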
8 changes: 4 additions & 4 deletions pandas/core/groupby/ops.py
@@ -164,8 +164,8 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0):
             com.get_callable_name(f) not in base.plotting_methods
             and isinstance(splitter, FrameSplitter)
             and axis == 0
-            # apply_frame_axis0 doesn't allow MultiIndex
-            and not isinstance(sdata.index, MultiIndex)
+            # fast_apply/libreduction doesn't allow non-numpy backed indexes
+            and not sdata.index._has_complex_internals
         ):
             try:
                 result_values, mutated = splitter.fast_apply(f, group_keys)
@@ -616,8 +616,8 @@ def agg_series(self, obj: Series, func):
             # TODO: can we get a performant workaround for EAs backed by ndarray?
             return self._aggregate_series_pure_python(obj, func)
 
-        elif isinstance(obj.index, MultiIndex):
-            # MultiIndex; Pre-empt TypeError in _aggregate_series_fast
+        elif obj.index._has_complex_internals:
+            # Pre-empt TypeError in _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
         try:

Review comment from the PR author on the `elif obj.index._has_complex_internals:` line:

    This now excludes PeriodIndex, which previously worked fine since .values
    converted to a numpy array. It looks more performant to exclude PeriodIndex
    though, since we avoid the conversion to numpy:

        In [1]: import numpy as np
           ...: import pandas as pd
           ...: from string import ascii_letters
           ...:
           ...: np.random.seed(123)
           ...: group = np.random.choice(list(ascii_letters), 10**5)
           ...: value = np.random.randint(12345, size=10**5)
           ...: index = pd.period_range("2000", freq="D", periods=10**5)
           ...: df = pd.DataFrame({"group": group, "value": value}, index=index)

        In [2]: %timeit df.groupby("group").agg({"value": pd.Series.nunique})
        17.8 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)  # on this branch
        95.9 ms ± 183 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)  # on master
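The benchmark from the review comment above, rewritten as a standalone script using timeit instead of IPython's %timeit (a sketch; absolute timings depend on machine and pandas build):

    import timeit
    from string import ascii_letters

    import numpy as np
    import pandas as pd

    # Same setup as the review comment's benchmark: 10**5 rows grouped by a
    # letter, with a PeriodIndex so agg_series takes the pure-Python path
    # on this branch.
    np.random.seed(123)
    n = 10 ** 5
    df = pd.DataFrame(
        {
            "group": np.random.choice(list(ascii_letters), n),
            "value": np.random.randint(12345, size=n),
        },
        index=pd.period_range("2000", freq="D", periods=n),
    )

    elapsed = timeit.timeit(
        lambda: df.groupby("group").agg({"value": pd.Series.nunique}), number=10
    )
    print(f"{elapsed / 10 * 1000:.1f} ms per loop")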
8 changes: 8 additions & 0 deletions pandas/core/indexes/base.py
@@ -4109,6 +4109,14 @@ def _assert_can_do_op(self, value):
         if not is_scalar(value):
             raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
 
+    @property
+    def _has_complex_internals(self):
+        """
+        Indicates if an index is not directly backed by a numpy array
+        """
+        # used to avoid libreduction code paths, which raise or require conversion
+        return False
+
     def _is_memory_usage_qualified(self) -> bool:
         """
         Return a boolean if we need a qualified .info display.
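A quick sketch of what the new property reports across index types on this branch (`_has_complex_internals` is private, not public API; the expected values follow from the base definition above and the overrides added in the files below):

    import pandas as pd

    for idx in [
        pd.Index([1, 2, 3]),                                        # ndarray-backed -> False
        pd.CategoricalIndex(list("abc")),                           # True
        pd.interval_range(0, 3),                                    # True
        pd.period_range("2020", periods=3, freq="D"),               # True
        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),  # True
    ]:
        print(type(idx).__name__, idx._has_complex_internals)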
5 changes: 5 additions & 0 deletions pandas/core/indexes/category.py
@@ -378,6 +378,11 @@ def values(self):
         """ return the underlying data, which is a Categorical """
         return self._data
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def _wrap_setop_result(self, other, result):
         name = get_op_result_name(self, other)
         # We use _shallow_copy rather than the Index implementation
5 changes: 5 additions & 0 deletions pandas/core/indexes/interval.py
@@ -411,6 +411,11 @@ def values(self):
         """
         return self._data
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def __array_wrap__(self, result, context=None):
         # we don't want the superclass implementation
         return result
5 changes: 5 additions & 0 deletions pandas/core/indexes/multi.py
@@ -1346,6 +1346,11 @@ def values(self):
         self._tuples = lib.fast_zip(values)
         return self._tuples
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     @cache_readonly
     def is_monotonic_increasing(self) -> bool:
         """
5 changes: 5 additions & 0 deletions pandas/core/indexes/period.py
@@ -255,6 +255,11 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
     def values(self):
         return np.asarray(self)
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def _shallow_copy(self, values=None, **kwargs):
         # TODO: simplify, figure out type of values
         if values is None:
17 changes: 17 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
@@ -360,6 +360,23 @@ def test_func_duplicates_raises():
         df.groupby("A").agg(["min", "min"])
 
 
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_agg_index_has_complex_internals(index):
+    # GH 31223
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group").agg({"value": Series.nunique})
+    expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
+    tm.assert_frame_equal(result, expected)
+
+
 class TestNamedAggregationSeries:
     def test_series_named_agg(self):
         df = pd.Series([1, 2, 3, 4])
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_apply.py
@@ -811,3 +811,19 @@ def test_groupby_apply_datetime_result_dtypes():
         index=["observation", "color", "mood", "intensity", "score"],
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_apply_index_has_complex_internals(index):
+    # GH 31248
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group").apply(lambda x: x)
+    tm.assert_frame_equal(result, df)