POC: Don't special case Python builtin and NumPy functions
rhshadrach committed May 28, 2023
commit 67c1c8dd59e35637297e37d6ac3db42213cad444
10 changes: 7 additions & 3 deletions pandas/core/apply.py
@@ -1105,10 +1105,14 @@ def agg(self):
             # we cannot FIRST try the vectorized evaluation, because
             # then .agg and .apply would have different semantics if the
             # operation is actually defined on the Series, e.g. str
-            try:
-                result = self.obj.apply(f)
-            except (ValueError, AttributeError, TypeError):
+            has_cython_func = f in com._orig_cython_table
+            if has_cython_func and not self.args and not self.kwargs:
                 result = f(self.obj)
+            else:
+                try:
+                    result = self.obj.apply(f)
+                except (ValueError, AttributeError, TypeError):
+                    result = f(self.obj)

        return result

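For orientation, here is a minimal, self-contained sketch of the dispatch the new branch implements: callables listed in the lookup table (com._orig_cython_table in the real code) are called on the Series as a whole when no extra arguments were passed, and everything else goes through Series.apply first. The agg_like helper and the tiny table below are illustrative stand-ins, not pandas internals.

```python
import builtins

import numpy as np
import pandas as pd

# Illustrative stand-in for pandas.core.common._orig_cython_table.
_ORIG_CYTHON_TABLE = {builtins.sum: "sum", np.sum: "sum", np.std: "std"}


def agg_like(obj: pd.Series, f, *args, **kwargs):
    # Known builtin/NumPy reducers with no extra arguments are applied to the
    # Series as a whole; anything else tries element-wise apply first and only
    # falls back to calling f on the whole object.
    if f in _ORIG_CYTHON_TABLE and not args and not kwargs:
        return f(obj)
    try:
        return obj.apply(f)
    except (ValueError, AttributeError, TypeError):
        return f(obj)


print(agg_like(pd.Series([1.0, 2.0, 4.0]), np.sum))       # 7.0
print(agg_like(pd.Series([1.0, 2.0, 4.0]), lambda x: x))  # element-wise apply
```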
34 changes: 34 additions & 0 deletions pandas/core/common.py
@@ -560,12 +560,46 @@ def require_length_match(data, index: Index) -> None:
 # whereas np.min and np.max (which directly call obj.min and obj.max)
 # default to axis=None.
 _builtin_table = {
+    # builtins.sum: np.sum,
+    # builtins.max: np.maximum.reduce,
+    # builtins.min: np.minimum.reduce,
+}
+
+_orig_builtin_table = {
     builtins.sum: np.sum,
     builtins.max: np.maximum.reduce,
     builtins.min: np.minimum.reduce,
 }

 _cython_table = {
Contributor:
Can _cython_table also be removed?

rhshadrach (Member, Author), May 29, 2023:
We need to know whether the passed function is a NumPy function (maybe there is a better way to do this?) for backwards compatibility. This is because Series.apply treats these differently.

+    # builtins.sum: "sum",
+    # builtins.max: "max",
+    # builtins.min: "min",
+    # np.all: "all",
+    # np.any: "any",
+    # np.sum: "sum",
+    # np.nansum: "sum",
+    # np.mean: "mean",
+    # np.nanmean: "mean",
+    # np.prod: "prod",
+    # np.nanprod: "prod",
+    # np.std: "std",
+    # np.nanstd: "std",
+    # np.var: "var",
+    # np.nanvar: "var",
+    # np.median: "median",
+    # np.nanmedian: "median",
+    # np.max: "max",
+    # np.nanmax: "max",
+    # np.min: "min",
+    # np.nanmin: "min",
+    # np.cumprod: "cumprod",
+    # np.nancumprod: "cumprod",
+    # np.cumsum: "cumsum",
+    # np.nancumsum: "cumsum",
+}
+
+_orig_cython_table = {
     builtins.sum: "sum",
     builtins.max: "max",
     builtins.min: "min",
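On the "maybe there is a better way to do this?" question in the thread above, one hedged alternative is to inspect the callable itself instead of keeping a lookup table around purely for detection. This is only a sketch, not what the commit does, and module-based checks can misclassify wrapped or re-exported callables.

```python
import builtins

import numpy as np


def is_builtin_or_numpy_reducer(func) -> bool:
    # Treat the three special-cased builtins, NumPy ufuncs, and anything whose
    # __module__ lives under the numpy package as "NumPy-like" for dispatch.
    if func in (builtins.sum, builtins.min, builtins.max):
        return True
    module = getattr(func, "__module__", None) or ""
    return isinstance(func, np.ufunc) or module.split(".")[0] == "numpy"


print(is_builtin_or_numpy_reducer(np.sum))       # True
print(is_builtin_or_numpy_reducer(sum))          # True
print(is_builtin_or_numpy_reducer(lambda x: x))  # False
```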
5 changes: 3 additions & 2 deletions pandas/tests/apply/test_frame_apply.py
@@ -1502,13 +1502,14 @@ def foo2(x, b=2, c=0):

 def test_agg_std():
     df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"])
+    expected_value = 1.632993161855452

     result = df.agg(np.std)
-    expected = Series({"A": 2.0, "B": 2.0}, dtype=float)
+    expected = Series({"A": expected_value, "B": expected_value}, dtype=float)
Contributor:
Different ddof here will be important to communicate in a possible deprecation.

     tm.assert_series_equal(result, expected)

     result = df.agg([np.std])
-    expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"])
+    expected = DataFrame({"A": expected_value, "B": expected_value}, index=["std"])
     tm.assert_frame_equal(result, expected)


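The ddof point raised in the comment above is exactly why the expected value changes: np.std defaults to ddof=0 while pandas' std defaults to ddof=1. A quick check with the same column values the test uses ([0, 2, 4]):

```python
import numpy as np
import pandas as pd

s = pd.Series([0, 2, 4], dtype=float)  # column "A" of np.arange(6).reshape(3, 2)
print(np.std(s.to_numpy()))  # ~1.632993161855452, the expected_value above (ddof=0)
print(s.std())               # 2.0, the old expected value (ddof=1)
print(s.std(ddof=0))         # matches np.std
```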
5 changes: 4 additions & 1 deletion pandas/tests/groupby/aggregate/test_cython.py
@@ -181,7 +181,10 @@ def test__cython_agg_general(op, targop):
     df = DataFrame(np.random.randn(1000))
     labels = np.random.randint(0, 50, size=1000).astype(float)

-    result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
+    kwargs = {"ddof": 0} if op == "var" else {}
+    result = df.groupby(labels)._cython_agg_general(
+        op, alt=None, numeric_only=True, **kwargs
+    )
     expected = df.groupby(labels).agg(targop)
     tm.assert_frame_equal(result, expected)

7 changes: 2 additions & 5 deletions pandas/tests/groupby/test_apply.py
@@ -1072,17 +1072,14 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):

     # Check output when no other methods are called before .apply()
     grp = df.groupby(by="a")
-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        result = grp.apply(sum)
+    result = grp.apply(lambda x: x.sum())
     tm.assert_frame_equal(result, expected)

     # Check output when another method is called before .apply()
     grp = df.groupby(by="a")
     args = get_groupby_method_args(reduction_func, df)
     _ = getattr(grp, reduction_func)(*args)
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        result = grp.apply(sum)
+    result = grp.apply(lambda x: x.sum())
     tm.assert_frame_equal(result, expected)


16 changes: 5 additions & 11 deletions pandas/tests/groupby/test_categorical.py
@@ -1234,8 +1234,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
     ).sortlevel()

     expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
-    if operation == "agg":
-        expected = expected.fillna(0, downcast="infer")
+    # if operation == "agg":
+    # expected = expected.fillna(0, downcast="infer")
     grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
     result = getattr(grouped, operation)(sum)
     tm.assert_series_equal(result, expected)
@@ -1676,21 +1676,15 @@ def test_categorical_transform():
                 "OnTheWay",
                 "Waiting",
             ],
-            "last_status": [
-                "Delivered",
-                "Delivered",
-                "Delivered",
-                "OnTheWay",
-                "OnTheWay",
-                "Waiting",
-            ],
+            # max doesn't take into account Categorical dtype
+            "last_status": "Waiting",
         }
     )

     expected["status"] = expected["status"].astype(delivery_status_type)

     # .transform(max) should preserve ordered categoricals
-    expected["last_status"] = expected["last_status"].astype(delivery_status_type)
+    # expected["last_status"] = expected["last_status"].astype(delivery_status_type)

     tm.assert_frame_equal(result, expected)
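A hedged illustration of the "# max doesn't take into account Categorical dtype" comment above, using toy categories rather than the delivery statuses from the test: the builtin compares the raw values, while Series.max respects the declared category order.

```python
import pandas as pd

s = pd.Series(
    pd.Categorical(["low", "high"], categories=["low", "high"], ordered=True)
)
print(max(s))   # 'low'  -- the builtin compares the strings lexicographically
print(s.max())  # 'high' -- Series.max honours the ordered categorical
```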

25 changes: 15 additions & 10 deletions pandas/tests/groupby/test_function.py
@@ -58,7 +58,7 @@ def test_intercept_builtin_sum():

     result = grouped.agg(builtins.sum)
     result2 = grouped.apply(builtins.sum)
-    expected = grouped.sum()
+    expected = Series({0: 1.0, 1: 2.0, 2: np.nan})
     tm.assert_series_equal(result, expected)
     tm.assert_series_equal(result2, expected)
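Why the expected Series is now spelled out instead of reusing grouped.sum(): once builtins.sum is no longer remapped to the pandas implementation, NaN propagates, since Python's sum does not skip missing values the way GroupBy.sum() does. Toy data below; the fixture used by the actual test is not shown in this diff.

```python
import builtins

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])
print(builtins.sum(s))  # nan -- plain Python sum propagates NaN
print(s.sum())          # 1.0 -- pandas skips NaN by default (skipna=True)
```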

@@ -74,17 +74,18 @@ def test_builtins_apply(keys, f):

     fname = f.__name__

-    warn = None if f is not sum else FutureWarning
-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
+    if fname == "sum":
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            gb.apply(f)
+    else:
         result = gb.apply(f)
-    ngroups = len(df.drop_duplicates(subset=keys))
-
-    assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
-    assert result.shape == (ngroups, 3), assert_msg
+        expected = Series({idx: f(group) for idx, group in gb})
+        expected.index.names = keys if isinstance(keys, list) else [keys]
+        tm.assert_series_equal(result, expected)

     npfunc = lambda x: getattr(np, fname)(x, axis=0)  # numpy's equivalent function
-    expected = gb.apply(npfunc)
+    result = gb.apply(npfunc)
+    expected = gb.apply(lambda x: getattr(x, fname)()).astype(float)
     tm.assert_frame_equal(result, expected)

     with tm.assert_produces_warning(None):
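For the new pytest.raises branch above: applying the plain builtin sum to a DataFrame group iterates the column labels, so it ends up adding an int to a string. A sketch with made-up column names:

```python
import pandas as pd

df = pd.DataFrame({"jim": [1, 2], "joe": [3, 4]})
try:
    sum(df)  # iterates column labels: 0 + "jim" + "joe"
except TypeError as err:
    print(err)  # unsupported operand type(s) for +: 'int' and 'str'
```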
@@ -683,7 +684,11 @@ def test_ops_general(op, targop):
     df = DataFrame(np.random.randn(1000))
     labels = np.random.randint(0, 50, size=1000).astype(float)

-    result = getattr(df.groupby(labels), op)()
+    if op in ("std", "var"):
+        kwargs = {"ddof": 0}
+    else:
+        kwargs = {}
+    result = getattr(df.groupby(labels), op)(**kwargs)
     expected = df.groupby(labels).agg(targop)
     tm.assert_frame_equal(result, expected)

18 changes: 11 additions & 7 deletions pandas/tests/groupby/test_groupby.py
@@ -82,7 +82,13 @@ def test_basic_aggregations(dtype):

     tm.assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
     tm.assert_series_equal(agged, grouped.mean())
-    tm.assert_series_equal(grouped.agg(np.sum), grouped.sum())
+
+    result = grouped.agg(np.sum)
+    expected = grouped.sum()
+    if dtype == "int32":
+        # NumPy sums int32 to int64
+        expected = expected.astype("int64")
+    tm.assert_series_equal(result, expected)

     expected = grouped.apply(lambda x: x * x.sum())
     transformed = grouped.transform(lambda x: x * x.sum())
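The int32 branch added above reflects NumPy's accumulator dtype rather than anything pandas-specific: summing an int32 array promotes to the platform default integer, which is why the test upcasts the grouped.sum() result before comparing. A small check:

```python
import numpy as np

arr = np.array([1, 2, 3], dtype=np.int32)
print(arr.sum().dtype)  # int64 on most 64-bit builds (platform dependent)
```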
@@ -753,11 +759,8 @@ def test_groupby_as_index_agg(df):
     gr = df.groupby(ts)
     gr.nth(0)  # invokes set_selection_from_grouper internally

-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        res = gr.apply(sum)
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        alt = df.groupby(ts).apply(sum)
+    res = gr.apply(np.sum)
+    alt = df.groupby(ts).apply(np.sum)
     tm.assert_frame_equal(res, alt)

     for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
@@ -923,9 +926,10 @@ def test_raises_on_nuisance(df):
     df = df.loc[:, ["A", "C", "D"]]
     df["E"] = datetime.now()
     grouped = df.groupby("A")
-    msg = "datetime64 type does not support sum operations"
+    msg = "does not support reduction 'sum'"
     with pytest.raises(TypeError, match=msg):
         grouped.agg(np.sum)
+    msg = "datetime64 type does not support sum operations"
     with pytest.raises(TypeError, match=msg):
         grouped.sum()

6 changes: 3 additions & 3 deletions pandas/tests/groupby/test_raises.py
@@ -329,7 +329,7 @@ def test_groupby_raises_datetime_np(
     gb = gb["d"]

     klass, msg = {
-        np.sum: (TypeError, "datetime64 type does not support sum operations"),
+        np.sum: (TypeError, "does not support reduction"),
         np.mean: (None, ""),
     }[groupby_func_np]

@@ -519,10 +519,10 @@ def test_groupby_raises_category_np(
     gb = gb["d"]

     klass, msg = {
-        np.sum: (TypeError, "category type does not support sum operations"),
+        np.sum: (TypeError, "category does not support reduction"),
         np.mean: (
             TypeError,
-            "category dtype does not support aggregation 'mean'",
+            "category does not support reduction",
         ),
     }[groupby_func_np]

20 changes: 10 additions & 10 deletions pandas/tests/groupby/transform/test_transform.py
@@ -471,22 +471,22 @@ def test_series_fast_transform_date():
     tm.assert_series_equal(result, expected)


-def test_transform_length():
+@pytest.mark.parametrize("op", [sum, np.nansum])
+def test_transform_length(op):
     # GH 9697
     df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]})
-    expected = Series([3.0] * 4)
-
-    def nsum(x):
-        return np.nansum(x)
+    if op is sum:
+        values = [3.0, 3.0, np.nan, np.nan]
+    else:
+        values = [3.0, 3.0, 3.0, 3.0]
+    expected = Series(values, name="col2")

     results = [
-        df.groupby("col1").transform(sum)["col2"],
-        df.groupby("col1")["col2"].transform(sum),
-        df.groupby("col1").transform(nsum)["col2"],
-        df.groupby("col1")["col2"].transform(nsum),
+        df.groupby("col1").transform(op)["col2"],
+        df.groupby("col1")["col2"].transform(op),
     ]
     for result in results:
-        tm.assert_series_equal(result, expected, check_names=False)
+        tm.assert_series_equal(result, expected)


def test_transform_coercion():
16 changes: 9 additions & 7 deletions pandas/tests/resample/test_resample_api.py
@@ -382,19 +382,21 @@ def test_agg():
     ]

     a_mean = r["A"].mean()
-    a_std = r["A"].std()
+    a_std_ddof0 = r["A"].std(ddof=0)
+    a_std_ddof1 = r["A"].std(ddof=1)
     a_sum = r["A"].sum()
     b_mean = r["B"].mean()
-    b_std = r["B"].std()
+    b_std_ddof0 = r["B"].std(ddof=0)
+    b_std_ddof1 = r["B"].std(ddof=1)
     b_sum = r["B"].sum()

-    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+    expected = pd.concat([a_mean, a_std_ddof0, b_mean, b_std_ddof0], axis=1)
     expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
     for t in cases:
         # In case 2, "date" is an index and a column, so get included in the agg
         if t == cases[2]:
             date_mean = t["date"].mean()
-            date_std = t["date"].std()
+            date_std = t["date"].std(ddof=0)
             exp = pd.concat([date_mean, date_std, expected], axis=1)
             exp.columns = pd.MultiIndex.from_product(
                 [["date", "A", "B"], ["mean", "std"]]
@@ -405,7 +407,7 @@ def test_agg():
         result = t.aggregate([np.mean, np.std])
         tm.assert_frame_equal(result, expected)

-    expected = pd.concat([a_mean, b_std], axis=1)
+    expected = pd.concat([a_mean, b_std_ddof0], axis=1)
     for t in cases:
         result = t.aggregate({"A": np.mean, "B": np.std})
         tm.assert_frame_equal(result, expected, check_like=True)
@@ -416,7 +418,7 @@ def test_agg():
         result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std))
         tm.assert_frame_equal(result, expected, check_like=True)

-    expected = pd.concat([a_mean, a_std], axis=1)
+    expected = pd.concat([a_mean, a_std_ddof1], axis=1)
     expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
     for t in cases:
         result = t.aggregate({"A": ["mean", "std"]})
@@ -449,7 +451,7 @@ def test_agg():
         }
     )

-    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+    expected = pd.concat([a_mean, a_std_ddof1, b_mean, b_std_ddof1], axis=1)
     expected.columns = pd.MultiIndex.from_tuples(
         [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")]
     )
4 changes: 1 addition & 3 deletions pandas/tests/reshape/test_pivot.py
@@ -2023,10 +2023,7 @@ def test_pivot_string_as_func(self):
         [
             ("sum", np.sum),
             ("mean", np.mean),
-            ("std", np.std),
             (["sum", "mean"], [np.sum, np.mean]),
-            (["sum", "std"], [np.sum, np.std]),
-            (["std", "mean"], [np.std, np.mean]),
         ],
     )
     def test_pivot_string_func_vs_func(self, f, f_numpy, data):
@@ -2035,6 +2032,7 @@ def test_pivot_string_func_vs_func(self, f, f_numpy, data):
         data = data.drop(columns="C")
         result = pivot_table(data, index="A", columns="B", aggfunc=f)
         expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy)
+
         tm.assert_frame_equal(result, expected)

@pytest.mark.slow