-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
ENH: better dtype inference when doing DataFrame reductions #52788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1e7e563 6397977 0e797b9 b846e70 76ce594 7644598 51da9ef 8d925cd d7d1989 54bcb60 03b8ce4 e0af36f 64d8d60 a95e5b9 e7a75e4 2e64191 b6c1dc8 32f9a73 8bf7ba8 35b07c5 6a390d4 8dc2acf 5a65c70 9cb34ec 8521f18 82cd91e e0bc63e 7cf26ae 6330840 efae9dc b585f3b 52763ab d4f2a84 7bfe3fe f48ea09 bbd8cb8 c6e9a80 5200896 26d4059 b6bd75e 44dcdce 3ebcbff 99d034e 79df9db d01fc1d bc582f6 a7fd1b1 1781d30 68fd316 8ceb57d 4375cb2 f7b354c f91c6ca 9a881fa 9d50f85 026696f f603de0 a7e69ad 772998f b20a289 082ddd9 8032514 3a3ec95 77992f7 23f22fb 3b8d8f0 1e39b65 1ed3e2d 467073a dd0bfe8 49334c7 5634106 f85deab 6519712 74410f6 e7503dc 24e2d11 e3afa18 899a2fb File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -32,6 +32,10 @@ | |
| Shape, | ||
| npt, | ||
| ) | ||
| from pandas.compat import ( | ||
| IS64, | ||
| is_platform_windows, | ||
| ) | ||
| from pandas.errors import AbstractMethodError | ||
| from pandas.util._decorators import doc | ||
| from pandas.util._validators import validate_fillna_kwargs | ||
| | @@ -1081,21 +1085,31 @@ def _quantile( | |
| # ------------------------------------------------------------------ | ||
| # Reductions | ||
| | ||
| def _reduce(self, name: str, *, skipna: bool = True, **kwargs): | ||
| def _reduce( | ||
| self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs | ||
| ): | ||
| if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}: | ||
| return getattr(self, name)(skipna=skipna, **kwargs) | ||
| | ||
| data = self._data | ||
| mask = self._mask | ||
| | ||
| # median, skew, kurt, sem | ||
| op = getattr(nanops, f"nan{name}") | ||
| result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) | ||
| result = getattr(self, name)(skipna=skipna, **kwargs) | ||
| else: | ||
| # median, skew, kurt, sem | ||
| data = self._data | ||
| mask = self._mask | ||
| op = getattr(nanops, f"nan{name}") | ||
| axis = kwargs.pop("axis", None) | ||
| result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs) | ||
| | ||
| if keepdims: | ||
| if isna(result): | ||
| return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) | ||
| else: | ||
| result = result.reshape(1) | ||
| mask = np.zeros(1, dtype=bool) | ||
| return self._maybe_mask_result(result, mask) | ||
| | ||
| if np.isnan(result): | ||
| if isna(result): | ||
| return libmissing.NA | ||
| | ||
| return result | ||
| else: | ||
| return result | ||
| | ||
| def _wrap_reduction_result(self, name: str, result, *, skipna, axis): | ||
| if isinstance(result, np.ndarray): | ||
| | @@ -1108,6 +1122,32 @@ def _wrap_reduction_result(self, name: str, result, *, skipna, axis): | |
| return self._maybe_mask_result(result, mask) | ||
| return result | ||
| | ||
| def _wrap_na_result(self, *, name, axis, mask_size): | ||
| mask = np.ones(mask_size, dtype=bool) | ||
| | ||
| float_dtyp = "float32" if self.dtype == "Float32" else "float64" | ||
| if name in ["mean", "median", "var", "std", "skew"]: | ||
| np_dtype = float_dtyp | ||
| elif name in ["min", "max"] or self.dtype.itemsize == 8: | ||
| np_dtype = self.dtype.numpy_dtype.name | ||
| else: | ||
| is_windows_or_32bit = is_platform_windows() or not IS64 | ||
| int_dtyp = "int32" if is_windows_or_32bit else "int64" | ||
| uint_dtyp = "uint32" if is_windows_or_32bit else "uint64" | ||
| np_dtype = {"b": int_dtyp, "i": int_dtyp, "u": uint_dtyp, "f": float_dtyp}[ | ||
| self.dtype.kind | ||
| ] | ||
| | ||
| value = np.array([1], dtype=np_dtype) | ||
| return self._maybe_mask_result(value, mask=mask) | ||
| | ||
| def _wrap_min_count_reduction_result( | ||
| self, name: str, result, *, skipna, min_count, axis | ||
| ): | ||
| if min_count == 0 and isinstance(result, np.ndarray): | ||
| return self._maybe_mask_result(result, np.zeros(result.shape, dtype=bool)) | ||
| return self._wrap_reduction_result(name, result, skipna=skipna, axis=axis) | ||
| | ||
| def sum( | ||
| self, | ||
| *, | ||
| | @@ -1125,7 +1165,9 @@ def sum( | |
| min_count=min_count, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("sum", result, skipna=skipna, axis=axis) | ||
| return self._wrap_min_count_reduction_result( | ||
| "sum", result, skipna=skipna, min_count=min_count, axis=axis | ||
| ) | ||
| | ||
| def prod( | ||
| self, | ||
| | @@ -1136,14 +1178,17 @@ def prod( | |
| **kwargs, | ||
| ): | ||
| nv.validate_prod((), kwargs) | ||
| | ||
| result = masked_reductions.prod( | ||
| self._data, | ||
| self._mask, | ||
| skipna=skipna, | ||
| min_count=min_count, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("prod", result, skipna=skipna, axis=axis) | ||
| return self._wrap_min_count_reduction_result( | ||
| "prod", result, skipna=skipna, min_count=min_count, axis=axis | ||
| ) | ||
| | ||
| def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| nv.validate_mean((), kwargs) | ||
| | @@ -1183,23 +1228,25 @@ def std( | |
| | ||
| def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| nv.validate_min((), kwargs) | ||
| return masked_reductions.min( | ||
| result = masked_reductions.min( | ||
| self._data, | ||
| self._mask, | ||
| skipna=skipna, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("min", result, skipna=skipna, axis=axis) | ||
| | ||
| def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| nv.validate_max((), kwargs) | ||
| return masked_reductions.max( | ||
| result = masked_reductions.max( | ||
| self._data, | ||
| self._mask, | ||
| skipna=skipna, | ||
| axis=axis, | ||
| ) | ||
| return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) | ||
| | ||
| def any(self, *, skipna: bool = True, **kwargs): | ||
| def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it needed to add the `axis` keyword here? Contributor Author There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll look into it, could be connected to your previous comment. Contributor Author There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this works, and I've made another version; I'll see if it passes, and then I'll look into your other comments. | ||
| """ | ||
| Return whether any element is truthy. | ||
| | ||
| | @@ -1218,6 +1265,7 @@ def any(self, *, skipna: bool = True, **kwargs): | |
| If `skipna` is False, the result will still be True if there is | ||
| at least one element that is truthy, otherwise NA will be returned | ||
| if there are NA's present. | ||
| axis : int, optional, default 0 | ||
| **kwargs : any, default None | ||
| Additional keywords have no effect but might be accepted for | ||
| compatibility with NumPy. | ||
| | @@ -1261,7 +1309,6 @@ def any(self, *, skipna: bool = True, **kwargs): | |
| >>> pd.array([0, 0, pd.NA]).any(skipna=False) | ||
| <NA> | ||
| """ | ||
| kwargs.pop("axis", None) | ||
| nv.validate_any((), kwargs) | ||
| | ||
| values = self._data.copy() | ||
| | @@ -1280,7 +1327,7 @@ def any(self, *, skipna: bool = True, **kwargs): | |
| else: | ||
| return self.dtype.na_value | ||
| | ||
| def all(self, *, skipna: bool = True, **kwargs): | ||
| def all(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): | ||
| """ | ||
| Return whether all elements are truthy. | ||
| | ||
| | @@ -1299,6 +1346,7 @@ def all(self, *, skipna: bool = True, **kwargs): | |
| If `skipna` is False, the result will still be False if there is | ||
| at least one element that is falsey, otherwise NA will be returned | ||
| if there are NA's present. | ||
| axis : int, optional, default 0 | ||
| **kwargs : any, default None | ||
| Additional keywords have no effect but might be accepted for | ||
| compatibility with NumPy. | ||
| | @@ -1342,7 +1390,6 @@ def all(self, *, skipna: bool = True, **kwargs): | |
| >>> pd.array([1, 0, pd.NA]).all(skipna=False) | ||
| False | ||
| """ | ||
| kwargs.pop("axis", None) | ||
| nv.validate_all((), kwargs) | ||
| | ||
| values = self._data.copy() | ||
| | @@ -1352,7 +1399,7 @@ def all(self, *, skipna: bool = True, **kwargs): | |
| # bool, int, float, complex, str, bytes, | ||
| # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" | ||
| np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] | ||
| result = values.all() | ||
| result = values.all(axis=axis) | ||
| | ||
| if skipna: | ||
| return result | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that now you are using `subset` again on this line, passing this `axis` is not doing anything (and would actually raise an error if you would pass `axis=1` here).
(It doesn't really matter in practice because we never call this with `axis=1`, but seeing `axis` passed through might give the false impression that this algo actually supports 2D data, while that is not the case.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@topper-123 can you address this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, I thought I had answered this, apparently not...
`func` here is either `np.min` or `np.max`, so supplying `axis=axis` will not raise here, but will work as expected AFAIKS.
Additionally, without the `axis=axis` part, `func(subset)` is similar to `np.max|min(subset, axis=None)`. Not a problem for 1d arrays, but will be a problem if we ever want to support `df.min(axis=None)` using 2d masked arrays. (I'm not sure we want to support 2d masked arrays or are going all in on arrow?)