POC: Don't special case Python builtin and NumPy functions
rhshadrach committed May 28, 2023
commit 67c1c8dd59e35637297e37d6ac3db42213cad444
10 changes: 7 additions & 3 deletions pandas/core/apply.py
@@ -1105,10 +1105,14 @@ def agg(self):
             # we cannot FIRST try the vectorized evaluation, because
             # then .agg and .apply would have different semantics if the
             # operation is actually defined on the Series, e.g. str
-            try:
-                result = self.obj.apply(f)
-            except (ValueError, AttributeError, TypeError):
+            has_cython_func = f in com._orig_cython_table
+            if has_cython_func and not self.args and not self.kwargs:
                 result = f(self.obj)
+            else:
+                try:
+                    result = self.obj.apply(f)
+                except (ValueError, AttributeError, TypeError):
+                    result = f(self.obj)

        return result

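For orientation, here is a minimal, self-contained sketch of the dispatch the new branch implements: callables listed in the lookup table (com._orig_cython_table in the real code) are called on the Series as a whole when no extra arguments were passed, and everything else goes through Series.apply first. The agg_like helper and the tiny table below are illustrative stand-ins, not pandas internals.

```python
import builtins

import numpy as np
import pandas as pd

# Illustrative stand-in for pandas.core.common._orig_cython_table.
_ORIG_CYTHON_TABLE = {builtins.sum: "sum", np.sum: "sum", np.std: "std"}


def agg_like(obj: pd.Series, f, *args, **kwargs):
    # Known builtin/NumPy reducers with no extra arguments are applied to the
    # Series as a whole; anything else tries element-wise apply first and only
    # falls back to calling f on the whole object.
    if f in _ORIG_CYTHON_TABLE and not args and not kwargs:
        return f(obj)
    try:
        return obj.apply(f)
    except (ValueError, AttributeError, TypeError):
        return f(obj)


print(agg_like(pd.Series([1.0, 2.0, 4.0]), np.sum))       # 7.0
print(agg_like(pd.Series([1.0, 2.0, 4.0]), lambda x: x))  # element-wise apply
```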
34 changes: 34 additions & 0 deletions pandas/core/common.py
@@ -560,12 +560,46 @@ def require_length_match(data, index: Index) -> None:
 # whereas np.min and np.max (which directly call obj.min and obj.max)
 # default to axis=None.
 _builtin_table = {
+    # builtins.sum: np.sum,
+    # builtins.max: np.maximum.reduce,
+    # builtins.min: np.minimum.reduce,
+}
+
+_orig_builtin_table = {
     builtins.sum: np.sum,
     builtins.max: np.maximum.reduce,
     builtins.min: np.minimum.reduce,
 }

 _cython_table = {
Contributor:
Can _cython_table also be removed?

rhshadrach (Member, Author), May 29, 2023:
We need to know whether the passed function is a NumPy function (maybe there is a better way to do this?) for backwards compatibility. This is because Series.apply treats these differently.

+    # builtins.sum: "sum",
+    # builtins.max: "max",
+    # builtins.min: "min",
+    # np.all: "all",
+    # np.any: "any",
+    # np.sum: "sum",
+    # np.nansum: "sum",
+    # np.mean: "mean",
+    # np.nanmean: "mean",
+    # np.prod: "prod",
+    # np.nanprod: "prod",
+    # np.std: "std",
+    # np.nanstd: "std",
+    # np.var: "var",
+    # np.nanvar: "var",
+    # np.median: "median",
+    # np.nanmedian: "median",
+    # np.max: "max",
+    # np.nanmax: "max",
+    # np.min: "min",
+    # np.nanmin: "min",
+    # np.cumprod: "cumprod",
+    # np.nancumprod: "cumprod",
+    # np.cumsum: "cumsum",
+    # np.nancumsum: "cumsum",
+}
+
+_orig_cython_table = {
     builtins.sum: "sum",
     builtins.max: "max",
     builtins.min: "min",
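On the "maybe there is a better way to do this?" question in the thread above, one hedged alternative is to inspect the callable itself instead of keeping a lookup table around purely for detection. This is only a sketch, not what the commit does, and module-based checks can misclassify wrapped or re-exported callables.

```python
import builtins

import numpy as np


def is_builtin_or_numpy_reducer(func) -> bool:
    # Treat the three special-cased builtins, NumPy ufuncs, and anything whose
    # __module__ lives under the numpy package as "NumPy-like" for dispatch.
    if func in (builtins.sum, builtins.min, builtins.max):
        return True
    module = getattr(func, "__module__", None) or ""
    return isinstance(func, np.ufunc) or module.split(".")[0] == "numpy"


print(is_builtin_or_numpy_reducer(np.sum))       # True
print(is_builtin_or_numpy_reducer(sum))          # True
print(is_builtin_or_numpy_reducer(lambda x: x))  # False
```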
5 changes: 3 additions & 2 deletions pandas/tests/apply/test_frame_apply.py
@@ -1502,13 +1502,14 @@ def foo2(x, b=2, c=0):

 def test_agg_std():
     df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"])
+    expected_value = 1.632993161855452

     result = df.agg(np.std)
-    expected = Series({"A": 2.0, "B": 2.0}, dtype=float)
+    expected = Series({"A": expected_value, "B": expected_value}, dtype=float)
Contributor:
Different ddof here will be important to communicate in a possible deprecation.

     tm.assert_series_equal(result, expected)

     result = df.agg([np.std])
-    expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"])
+    expected = DataFrame({"A": expected_value, "B": expected_value}, index=["std"])
     tm.assert_frame_equal(result, expected)


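The ddof point raised in the comment above is exactly why the expected value changes: np.std defaults to ddof=0 while pandas' std defaults to ddof=1. A quick check with the same column values the test uses ([0, 2, 4]):

```python
import numpy as np
import pandas as pd

s = pd.Series([0, 2, 4], dtype=float)  # column "A" of np.arange(6).reshape(3, 2)
print(np.std(s.to_numpy()))  # ~1.632993161855452, the expected_value above (ddof=0)
print(s.std())               # 2.0, the old expected value (ddof=1)
print(s.std(ddof=0))         # matches np.std
```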
5 changes: 4 additions & 1 deletion pandas/tests/groupby/aggregate/test_cython.py
@@ -181,7 +181,10 @@ def test__cython_agg_general(op, targop):
     df = DataFrame(np.random.randn(1000))
     labels = np.random.randint(0, 50, size=1000).astype(float)

-    result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
+    kwargs = {"ddof": 0} if op == "var" else {}
+    result = df.groupby(labels)._cython_agg_general(
+        op, alt=None, numeric_only=True, **kwargs
+    )
     expected = df.groupby(labels).agg(targop)
     tm.assert_frame_equal(result, expected)

7 changes: 2 additions & 5 deletions pandas/tests/groupby/test_apply.py
@@ -1072,17 +1072,14 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):

     # Check output when no other methods are called before .apply()
     grp = df.groupby(by="a")
-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        result = grp.apply(sum)
+    result = grp.apply(lambda x: x.sum())
     tm.assert_frame_equal(result, expected)

     # Check output when another method is called before .apply()
     grp = df.groupby(by="a")
     args = get_groupby_method_args(reduction_func, df)
     _ = getattr(grp, reduction_func)(*args)
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        result = grp.apply(sum)
+    result = grp.apply(lambda x: x.sum())
     tm.assert_frame_equal(result, expected)


16 changes: 5 additions & 11 deletions pandas/tests/groupby/test_categorical.py
@@ -1234,8 +1234,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
     ).sortlevel()

     expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
-    if operation == "agg":
-        expected = expected.fillna(0, downcast="infer")
+    # if operation == "agg":
+    # expected = expected.fillna(0, downcast="infer")
     grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
     result = getattr(grouped, operation)(sum)
     tm.assert_series_equal(result, expected)
@@ -1676,21 +1676,15 @@ def test_categorical_transform():
                 "OnTheWay",
                 "Waiting",
             ],
-            "last_status": [
-                "Delivered",
-                "Delivered",
-                "Delivered",
-                "OnTheWay",
-                "OnTheWay",
-                "Waiting",
-            ],
+            # max doesn't take into account Categorical dtype
+            "last_status": "Waiting",
         }
     )

     expected["status"] = expected["status"].astype(delivery_status_type)

     # .transform(max) should preserve ordered categoricals
-    expected["last_status"] = expected["last_status"].astype(delivery_status_type)
+    # expected["last_status"] = expected["last_status"].astype(delivery_status_type)

     tm.assert_frame_equal(result, expected)
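A hedged illustration of the "# max doesn't take into account Categorical dtype" comment above, using toy categories rather than the delivery statuses from the test: the builtin compares the raw values, while Series.max respects the declared category order.

```python
import pandas as pd

s = pd.Series(
    pd.Categorical(["low", "high"], categories=["low", "high"], ordered=True)
)
print(max(s))   # 'low'  -- the builtin compares the strings lexicographically
print(s.max())  # 'high' -- Series.max honours the ordered categorical
```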

25 changes: 15 additions & 10 deletions pandas/tests/groupby/test_function.py
@@ -58,7 +58,7 @@ def test_intercept_builtin_sum():

     result = grouped.agg(builtins.sum)
     result2 = grouped.apply(builtins.sum)
-    expected = grouped.sum()
+    expected = Series({0: 1.0, 1: 2.0, 2: np.nan})
     tm.assert_series_equal(result, expected)
     tm.assert_series_equal(result2, expected)
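Why the expected Series is now spelled out instead of reusing grouped.sum(): once builtins.sum is no longer remapped to the pandas implementation, NaN propagates, since Python's sum does not skip missing values the way GroupBy.sum() does. Toy data below; the fixture used by the actual test is not shown in this diff.

```python
import builtins

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])
print(builtins.sum(s))  # nan -- plain Python sum propagates NaN
print(s.sum())          # 1.0 -- pandas skips NaN by default (skipna=True)
```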

@@ -74,17 +74,18 @@ def test_builtins_apply(keys, f):

     fname = f.__name__

-    warn = None if f is not sum else FutureWarning
-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
+    if fname == "sum":
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            gb.apply(f)
+    else:
         result = gb.apply(f)
-    ngroups = len(df.drop_duplicates(subset=keys))
-
-    assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
-    assert result.shape == (ngroups, 3), assert_msg
+        expected = Series({idx: f(group) for idx, group in gb})
+        expected.index.names = keys if isinstance(keys, list) else [keys]
+        tm.assert_series_equal(result, expected)

     npfunc = lambda x: getattr(np, fname)(x, axis=0)  # numpy's equivalent function
-    expected = gb.apply(npfunc)
+    result = gb.apply(npfunc)
+    expected = gb.apply(lambda x: getattr(x, fname)()).astype(float)
     tm.assert_frame_equal(result, expected)

     with tm.assert_produces_warning(None):
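For the new pytest.raises branch above: applying the plain builtin sum to a DataFrame group iterates the column labels, so it ends up adding an int to a string. A sketch with made-up column names:

```python
import pandas as pd

df = pd.DataFrame({"jim": [1, 2], "joe": [3, 4]})
try:
    sum(df)  # iterates column labels: 0 + "jim" + "joe"
except TypeError as err:
    print(err)  # unsupported operand type(s) for +: 'int' and 'str'
```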
@@ -683,7 +684,11 @@ def test_ops_general(op, targop):
     df = DataFrame(np.random.randn(1000))
     labels = np.random.randint(0, 50, size=1000).astype(float)

-    result = getattr(df.groupby(labels), op)()
+    if op in ("std", "var"):
+        kwargs = {"ddof": 0}
+    else:
+        kwargs = {}
+    result = getattr(df.groupby(labels), op)(**kwargs)
     expected = df.groupby(labels).agg(targop)
     tm.assert_frame_equal(result, expected)

18 changes: 11 additions & 7 deletions pandas/tests/groupby/test_groupby.py
@@ -82,7 +82,13 @@ def test_basic_aggregations(dtype):

     tm.assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
     tm.assert_series_equal(agged, grouped.mean())
-    tm.assert_series_equal(grouped.agg(np.sum), grouped.sum())
+
+    result = grouped.agg(np.sum)
+    expected = grouped.sum()
+    if dtype == "int32":
+        # NumPy sums int32 to int64
+        expected = expected.astype("int64")
+    tm.assert_series_equal(result, expected)

     expected = grouped.apply(lambda x: x * x.sum())
     transformed = grouped.transform(lambda x: x * x.sum())
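The int32 branch added above reflects NumPy's accumulator dtype rather than anything pandas-specific: summing an int32 array promotes to the platform default integer, which is why the test upcasts the grouped.sum() result before comparing. A small check:

```python
import numpy as np

arr = np.array([1, 2, 3], dtype=np.int32)
print(arr.sum().dtype)  # int64 on most 64-bit builds (platform dependent)
```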
@@ -753,11 +759,8 @@ def test_groupby_as_index_agg(df):
     gr = df.groupby(ts)
     gr.nth(0)  # invokes set_selection_from_grouper internally

-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        res = gr.apply(sum)
-    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
-        alt = df.groupby(ts).apply(sum)
+    res = gr.apply(np.sum)
+    alt = df.groupby(ts).apply(np.sum)
     tm.assert_frame_equal(res, alt)

     for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
@@ -923,9 +926,10 @@ def test_raises_on_nuisance(df):
     df = df.loc[:, ["A", "C", "D"]]
     df["E"] = datetime.now()
     grouped = df.groupby("A")
-    msg = "datetime64 type does not support sum operations"
+    msg = "does not support reduction 'sum'"
     with pytest.raises(TypeError, match=msg):
         grouped.agg(np.sum)
+    msg = "datetime64 type does not support sum operations"
     with pytest.raises(TypeError, match=msg):
         grouped.sum()

6 changes: 3 additions & 3 deletions pandas/tests/groupby/test_raises.py
@@ -329,7 +329,7 @@ def test_groupby_raises_datetime_np(
     gb = gb["d"]

     klass, msg = {
-        np.sum: (TypeError, "datetime64 type does not support sum operations"),
+        np.sum: (TypeError, "does not support reduction"),
         np.mean: (None, ""),
     }[groupby_func_np]

@@ -519,10 +519,10 @@ def test_groupby_raises_category_np(
     gb = gb["d"]

     klass, msg = {
-        np.sum: (TypeError, "category type does not support sum operations"),
+        np.sum: (TypeError, "category does not support reduction"),
         np.mean: (
             TypeError,
-            "category dtype does not support aggregation 'mean'",
+            "category does not support reduction",
         ),
     }[groupby_func_np]

20 changes: 10 additions & 10 deletions pandas/tests/groupby/transform/test_transform.py
@@ -471,22 +471,22 @@ def test_series_fast_transform_date():
     tm.assert_series_equal(result, expected)


-def test_transform_length():
+@pytest.mark.parametrize("op", [sum, np.nansum])
+def test_transform_length(op):
     # GH 9697
     df = DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]})
-    expected = Series([3.0] * 4)
-
-    def nsum(x):
-        return np.nansum(x)
+    if op is sum:
+        values = [3.0, 3.0, np.nan, np.nan]
+    else:
+        values = [3.0, 3.0, 3.0, 3.0]
+    expected = Series(values, name="col2")

     results = [
-        df.groupby("col1").transform(sum)["col2"],
-        df.groupby("col1")["col2"].transform(sum),
-        df.groupby("col1").transform(nsum)["col2"],
-        df.groupby("col1")["col2"].transform(nsum),
+        df.groupby("col1").transform(op)["col2"],
+        df.groupby("col1")["col2"].transform(op),
     ]
     for result in results:
-        tm.assert_series_equal(result, expected, check_names=False)
+        tm.assert_series_equal(result, expected)


def test_transform_coercion():
16 changes: 9 additions & 7 deletions pandas/tests/resample/test_resample_api.py
@@ -382,19 +382,21 @@ def test_agg():
     ]

     a_mean = r["A"].mean()
-    a_std = r["A"].std()
+    a_std_ddof0 = r["A"].std(ddof=0)
+    a_std_ddof1 = r["A"].std(ddof=1)
     a_sum = r["A"].sum()
     b_mean = r["B"].mean()
-    b_std = r["B"].std()
+    b_std_ddof0 = r["B"].std(ddof=0)
+    b_std_ddof1 = r["B"].std(ddof=1)
     b_sum = r["B"].sum()

-    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+    expected = pd.concat([a_mean, a_std_ddof0, b_mean, b_std_ddof0], axis=1)
     expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
     for t in cases:
         # In case 2, "date" is an index and a column, so get included in the agg
         if t == cases[2]:
             date_mean = t["date"].mean()
-            date_std = t["date"].std()
+            date_std = t["date"].std(ddof=0)
             exp = pd.concat([date_mean, date_std, expected], axis=1)
             exp.columns = pd.MultiIndex.from_product(
                 [["date", "A", "B"], ["mean", "std"]]
@@ -405,7 +407,7 @@ def test_agg():
         result = t.aggregate([np.mean, np.std])
         tm.assert_frame_equal(result, expected)

-    expected = pd.concat([a_mean, b_std], axis=1)
+    expected = pd.concat([a_mean, b_std_ddof0], axis=1)
     for t in cases:
         result = t.aggregate({"A": np.mean, "B": np.std})
         tm.assert_frame_equal(result, expected, check_like=True)
@@ -416,7 +418,7 @@ def test_agg():
         result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std))
         tm.assert_frame_equal(result, expected, check_like=True)

-    expected = pd.concat([a_mean, a_std], axis=1)
+    expected = pd.concat([a_mean, a_std_ddof1], axis=1)
     expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
     for t in cases:
         result = t.aggregate({"A": ["mean", "std"]})
@@ -449,7 +451,7 @@ def test_agg():
         }
     )

-    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+    expected = pd.concat([a_mean, a_std_ddof1, b_mean, b_std_ddof1], axis=1)
     expected.columns = pd.MultiIndex.from_tuples(
         [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")]
     )
4 changes: 1 addition & 3 deletions pandas/tests/reshape/test_pivot.py
@@ -2023,10 +2023,7 @@ def test_pivot_string_as_func(self):
         [
             ("sum", np.sum),
             ("mean", np.mean),
-            ("std", np.std),
             (["sum", "mean"], [np.sum, np.mean]),
-            (["sum", "std"], [np.sum, np.std]),
-            (["std", "mean"], [np.std, np.mean]),
         ],
     )
     def test_pivot_string_func_vs_func(self, f, f_numpy, data):
@@ -2035,6 +2032,7 @@ def test_pivot_string_func_vs_func(self, f, f_numpy, data):
         data = data.drop(columns="C")
         result = pivot_table(data, index="A", columns="B", aggfunc=f)
         expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy)
+
         tm.assert_frame_equal(result, expected)

@pytest.mark.slow