pandas-dev · mroeschke · Jul 13, 2023 · Apr 19, 2023 · Apr 19, 2023 · Apr 19, 2023
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
@@ -126,10 +126,11 @@ These dtypes can be merged, reshaped & casted.
  pd.concat([df[["A"]], df[["B", "C"]]], axis=1).dtypes
  df["A"].astype(float)
 
-Reduction and groupby operations such as 'sum' work as well.
+Reduction and groupby operations such as :meth:`~DataFrame.sum` work as well.
 
 .. ipython:: python
 
+ df.sum(numeric_only=True)
  df.sum()
  df.groupby("B").A.sum()
 

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -14,6 +14,46 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+.. _whatsnew_210.enhancements.reduction_extension_dtypes:
+
+DataFrame reductions preserve extension dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, the results of DataFrame reductions
+(:meth:`DataFrame.sum`, :meth:`DataFrame.mean`, etc.) had numpy dtypes, even when the DataFrames
+were of extension dtypes. Pandas can now keep the dtypes when doing reductions over DataFrame
+columns with a common dtype (:issue:`52788`).
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+ In [1]: df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64")
+ In [2]: df.sum()
+ Out[2]:
+ a 5
+ b 9
+ dtype: int64
+ In [3]: df = df.astype("int64[pyarrow]")
+ In [4]: df.sum()
+ Out[4]:
+ a 5
+ b 9
+ dtype: int64
+
+*New Behavior*
+
+.. ipython:: python
+
+ df = pd.DataFrame({"a": [1, 1, 2, 1], "b": [np.nan, 2.0, 3.0, 4.0]}, dtype="Int64")
+ df.sum()
+ df = df.astype("int64[pyarrow]")
+ df.sum()
+
+Notice that the dtype is now a masked dtype and a pyarrow dtype, respectively, while previously it was a numpy integer dtype.
+
+To allow DataFrame reductions to preserve extension dtypes, :meth:`ExtensionArray._reduce` has gotten a new keyword parameter ``keepdims``. Calling :meth:`ExtensionArray._reduce` with ``keepdims=True`` should return an array of length 1 along the reduction axis. In order to maintain backward compatibility, the parameter is not required, but it will become required in the future. If the parameter is not found in the signature, DataFrame reductions cannot preserve extension dtypes. Also, if the parameter is not found, a ``FutureWarning`` will be emitted and type checkers like mypy may complain about the signature not being compatible with :meth:`ExtensionArray._reduce`.
+
 .. _whatsnew_210.enhancements.cow:
 
 Copy-on-Write improvements

diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
@@ -52,7 +52,7 @@ def _reductions(
  axis : int, optional, default None
  """
  if not skipna:
- if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
+ if mask.any() or check_below_min_count(values.shape, None, min_count):
  return libmissing.NA
  else:
  return func(values, axis=axis, **kwargs)
@@ -119,11 +119,11 @@ def _minmax(
  # min/max with empty array raise in numpy, pandas returns NA
  return libmissing.NA
  else:
- return func(values)
+ return func(values, axis=axis)
  else:
  subset = values[~mask]
  if subset.size:
- return func(subset)
+ return func(subset, axis=axis)
  else:
  # min/max with empty array raise in numpy, pandas returns NA
  return libmissing.NA

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1508,7 +1508,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
 
  return result
 
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+ def _reduce(
+ self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+ ):
  """
  Return a scalar result of performing the reduction operation.
 
@@ -1532,12 +1534,16 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
  ------
  TypeError : subclass does not define reductions
  """
- result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
+ pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
 
- if pc.is_null(result).as_py():
- return self.dtype.na_value
+ if keepdims:
+ result = pa.array([pa_result.as_py()], type=pa_result.type)
+ return type(self)(result)
 
- return result.as_py()
+ if pc.is_null(pa_result).as_py():
+ return self.dtype.na_value
+ else:
+ return pa_result.as_py()
 
  def _explode(self):
  """

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1447,7 +1447,9 @@ def _accumulate(
  """
  raise NotImplementedError(f"cannot perform {name} with type {self.dtype}")
 
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+ def _reduce(
+ self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+ ):
  """
  Return a scalar result of performing the reduction operation.
 
@@ -1459,6 +1461,15 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
  std, var, sem, kurt, skew }.
  skipna : bool, default True
  If True, skip NaN values.
+ keepdims : bool, default False
+ If False, a scalar is returned.
+ If True, the result has a dimension of size one along the reduced axis.
+
+ .. versionadded:: 2.1
+
+ This parameter is not required in the _reduce signature to keep backward
+ compatibility, but will become required in the future. If the parameter
+ is not found in the method signature, a FutureWarning will be emitted.
  **kwargs
  Additional keyword arguments passed to the reduction function.
  Currently, `ddof` is the only supported kwarg.
@@ -1477,7 +1488,11 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
  f"'{type(self).__name__}' with dtype {self.dtype} "
  f"does not support reduction '{name}'"
  )
- return meth(skipna=skipna, **kwargs)
+ result = meth(skipna=skipna, **kwargs)
+ if keepdims:
+ result = np.array([result])
+
+ return result
 
  # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
  # Incompatible types in assignment (expression has type "None", base class

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2319,6 +2319,15 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
  # ------------------------------------------------------------------
  # Reductions
 
+ def _reduce(
+ self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+ ):
+ result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+ if keepdims:
+ return type(self)(result, dtype=self.dtype)
+ else:
+ return result
+
  def min(self, *, skipna: bool = True, **kwargs):
  """
  The minimum value of the object.

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -32,6 +32,10 @@
  Shape,
  npt,
 )
+from pandas.compat import (
+ IS64,
+ is_platform_windows,
+)
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
@@ -1081,21 +1085,31 @@ def _quantile(
  # ------------------------------------------------------------------
  # Reductions
 
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+ def _reduce(
+ self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+ ):
  if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}:
- return getattr(self, name)(skipna=skipna, **kwargs)
-
- data = self._data
- mask = self._mask
-
- # median, skew, kurt, sem
- op = getattr(nanops, f"nan{name}")
- result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
+ result = getattr(self, name)(skipna=skipna, **kwargs)
+ else:
+ # median, skew, kurt, sem
+ data = self._data
+ mask = self._mask
+ op = getattr(nanops, f"nan{name}")
+ axis = kwargs.pop("axis", None)
+ result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
+
+ if keepdims:
+ if isna(result):
+ return self._wrap_na_result(name=name, axis=0, mask_size=(1,))
+ else:
+ result = result.reshape(1)
+ mask = np.zeros(1, dtype=bool)
+ return self._maybe_mask_result(result, mask)
 
- if np.isnan(result):
+ if isna(result):
  return libmissing.NA
-
- return result
+ else:
+  return result
 
  def _wrap_reduction_result(self, name: str, result, *, skipna, axis):
  if isinstance(result, np.ndarray):
@@ -1108,6 +1122,32 @@ def _wrap_reduction_result(self, name: str, result, *, skipna, axis):
  return self._maybe_mask_result(result, mask)
  return result
 
+ def _wrap_na_result(self, *, name, axis, mask_size):
+ mask = np.ones(mask_size, dtype=bool)
+
+ float_dtyp = "float32" if self.dtype == "Float32" else "float64"
+ if name in ["mean", "median", "var", "std", "skew"]:
+ np_dtype = float_dtyp
+ elif name in ["min", "max"] or self.dtype.itemsize == 8:
+ np_dtype = self.dtype.numpy_dtype.name
+ else:
+ is_windows_or_32bit = is_platform_windows() or not IS64
+ int_dtyp = "int32" if is_windows_or_32bit else "int64"
+ uint_dtyp = "uint32" if is_windows_or_32bit else "uint64"
+ np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[
+ self.dtype.kind
+ ]
+
+ value = np.array([1], dtype=np_dtype)
+ return self._maybe_mask_result(value, mask=mask)
+
+ def _wrap_min_count_reduction_result(
+ self, name: str, result, *, skipna, min_count, axis
+ ):
+ if min_count == 0 and isinstance(result, np.ndarray):
+ return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool))
+ return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis)
+
  def sum(
  self,
  *,
@@ -1125,7 +1165,9 @@ def sum(
  min_count=min_count,
  axis=axis,
  )
- return self._wrap_reduction_result("sum", result, skipna=skipna, axis=axis)
+ return self._wrap_min_count_reduction_result(
+ "sum", result, skipna=skipna, min_count=min_count, axis=axis
+ )
 
  def prod(
  self,
@@ -1136,14 +1178,17 @@ def prod(
  **kwargs,
  ):
  nv.validate_prod((), kwargs)
+
  result = masked_reductions.prod(
  self._data,
  self._mask,
  skipna=skipna,
  min_count=min_count,
  axis=axis,
  )
- return self._wrap_reduction_result("prod", result, skipna=skipna, axis=axis)
+ return self._wrap_min_count_reduction_result(
+ "prod", result, skipna=skipna, min_count=min_count, axis=axis
+ )
 
  def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
  nv.validate_mean((), kwargs)
@@ -1183,23 +1228,25 @@ def std(
 
  def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
  nv.validate_min((), kwargs)
- return masked_reductions.min(
+ result = masked_reductions.min(
  self._data,
  self._mask,
  skipna=skipna,
  axis=axis,
  )
+ return self._wrap_reduction_result("min", result, skipna=skipna, axis=axis)
 
  def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
  nv.validate_max((), kwargs)
- return masked_reductions.max(
+ result = masked_reductions.max(
  self._data,
  self._mask,
  skipna=skipna,
  axis=axis,
  )
+ return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis)
 
- def any(self, *, skipna: bool = True, **kwargs):
+ def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
  """
  Return whether any element is truthy.
 
@@ -1218,6 +1265,7 @@ def any(self, *, skipna: bool = True, **kwargs):
  If `skipna` is False, the result will still be True if there is
  at least one element that is truthy, otherwise NA will be returned
  if there are NA's present.
+ axis : int, optional, default 0
  **kwargs : any, default None
  Additional keywords have no effect but might be accepted for
  compatibility with NumPy.
@@ -1261,7 +1309,6 @@ def any(self, *, skipna: bool = True, **kwargs):
  >>> pd.array([0, 0, pd.NA]).any(skipna=False)
  <NA>
  """
- kwargs.pop("axis", None)
  nv.validate_any((), kwargs)
 
  values = self._data.copy()
@@ -1280,7 +1327,7 @@ def any(self, *, skipna: bool = True, **kwargs):
  else:
  return self.dtype.na_value
 
- def all(self, *, skipna: bool = True, **kwargs):
+ def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
  """
  Return whether all elements are truthy.
 
@@ -1299,6 +1346,7 @@ def all(self, *, skipna: bool = True, **kwargs):
  If `skipna` is False, the result will still be False if there is
  at least one element that is falsey, otherwise NA will be returned
  if there are NA's present.
+ axis : int, optional, default 0
  **kwargs : any, default None
  Additional keywords have no effect but might be accepted for
  compatibility with NumPy.
@@ -1342,7 +1390,6 @@ def all(self, *, skipna: bool = True, **kwargs):
  >>> pd.array([1, 0, pd.NA]).all(skipna=False)
  False
  """
- kwargs.pop("axis", None)
  nv.validate_all((), kwargs)
 
  values = self._data.copy()
@@ -1352,7 +1399,7 @@ def all(self, *, skipna: bool = True, **kwargs):
  # bool, int, float, complex, str, bytes,
  # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
  np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type]
- result = values.all()
+ result = values.all(axis=axis)
 
  if skipna:
  return result

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -1384,7 +1384,9 @@ def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
  # Reductions
  # ------------------------------------------------------------------------
 
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
+ def _reduce(
+ self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+ ):
  method = getattr(self, name, None)
 
  if method is None:
@@ -1395,7 +1397,12 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
  else:
  arr = self.dropna()
 
- return getattr(arr, name)(**kwargs)
+ result = getattr(arr, name)(**kwargs)
+
+ if keepdims:
+ return type(self)([result], dtype=self.dtype)
+ else:
+ return result
 
  def all(self, axis=None, *args, **kwargs):
  """