Skip to content
14 changes: 14 additions & 0 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,18 @@ def time_division(self, fill_value):
self.arr1 / self.arr2


class MinMax:
    """ASV benchmark: time ``SparseArray.min``/``max`` for null and non-null fills."""

    # Cartesian product: reduction name x fill value (NaN = null fill).
    params = (["min", "max"], [0.0, np.nan])
    param_names = ["func", "fill_value"]

    def setup(self, func, fill_value):
        # One million elements at 1e-5 density keeps the sparse path dominant.
        size = 1_000_000
        dense = make_array(size, 1e-5, fill_value, np.float64)
        self.sp_arr = SparseArray(dense, fill_value=fill_value)

    def time_min_max(self, func, fill_value):
        bound_method = getattr(self.sp_arr, func)
        bound_method()


from .pandas_vb_common import setup # noqa: F401 isort:skip
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ Performance improvements
- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -437,6 +438,7 @@ Reshaping
Sparse
^^^^^^
- Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`)
- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`)
-
-

Expand Down
66 changes: 56 additions & 10 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1456,23 +1456,69 @@ def mean(self, axis=0, *args, **kwargs):
nsparse = self.sp_index.ngaps
return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

def max(self, axis: int = 0, *args, **kwargs) -> Scalar:
    """
    Max of non-NA/null values

    Parameters
    ----------
    axis : int, default 0
        Not Used. NumPy compatibility.
    *args, **kwargs
        Not Used. NumPy compatibility.

    Returns
    -------
    scalar
    """
    # Reject unsupported numpy-compat arguments (e.g. out=, keepdims=).
    nv.validate_max(args, kwargs)
    # min/max share one implementation; dispatch on the reduction name.
    return self._min_max("max")
def min(self, axis: int = 0, *args, **kwargs) -> Scalar:
    """
    Min of non-NA/null values

    Parameters
    ----------
    axis : int, default 0
        Not Used. NumPy compatibility.
    *args, **kwargs
        Not Used. NumPy compatibility.

    Returns
    -------
    scalar
    """
    # Reject unsupported numpy-compat arguments (e.g. out=, keepdims=).
    nv.validate_min(args, kwargs)
    # min/max share one implementation; dispatch on the reduction name.
    return self._min_max("min")
def _min_max(self, kind: Literal["min", "max"]) -> Scalar:
"""
Min/max of non-NA/null values

# This condition returns a nan if there are no valid values in the array.
if self.size > 0 and self._valid_sp_values.size == 0:
Parameters
----------
kind : {"min", "max"}

Returns
-------
scalar
"""
valid_vals = self._valid_sp_values
has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
if len(valid_vals) > 0:
sp_min_max = getattr(valid_vals, kind)()

# If a non-null fill value is currently present, it might be the min/max
if has_nonnull_fill_vals:
func = max if kind == "max" else min
return func(sp_min_max, self.fill_value)
else:
return sp_min_max
elif has_nonnull_fill_vals:
return self.fill_value
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might has well just do elif else

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, updated

return np.nanmin(self, axis)
return na_value_for_dtype(self.dtype.subtype)

# ------------------------------------------------------------------------
# Ufuncs
Expand Down
56 changes: 42 additions & 14 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1362,26 +1362,54 @@ def test_drop_duplicates_fill_value():


class TestMinMax:
    """Tests for SparseArray.min/max across fill-value configurations."""

    @pytest.mark.parametrize(
        "raw_data,max_expected,min_expected",
        [
            (np.arange(5.0), [4], [0]),
            (-np.arange(5.0), [0], [-4]),
            (np.array([0, 1, 2, np.nan, 4]), [4], [0]),
            (np.array([np.nan] * 5), [np.nan], [np.nan]),
            (np.array([]), [np.nan], [np.nan]),
        ],
    )
    def test_nan_fill_value(self, raw_data, max_expected, min_expected):
        # `in` on a list checks identity before equality, so `np.nan in
        # [np.nan]` is True even though `np.nan == np.nan` is False.
        max_result = SparseArray(raw_data).max()
        min_result = SparseArray(raw_data).min()
        assert max_result in max_expected
        assert min_result in min_expected

    @pytest.mark.parametrize(
        "fill_value,max_expected,min_expected",
        [
            (100, 100, 0),
            (-100, 1, -100),
        ],
    )
    def test_fill_value(self, fill_value, max_expected, min_expected):
        arr = SparseArray(
            np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value)
        )
        max_result = arr.max()
        assert max_result == max_expected

        min_result = arr.min()
        assert min_result == min_expected

    @pytest.mark.parametrize("func", ["min", "max"])
    @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])])
    @pytest.mark.parametrize(
        "dtype,expected",
        [
            (SparseDtype(np.float64, np.nan), np.nan),
            (SparseDtype(np.float64, 5.0), np.nan),
            (SparseDtype("datetime64[ns]", pd.NaT), pd.NaT),
            (SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT),
        ],
    )
    def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
        arr = SparseArray(data, dtype=dtype)
        result = getattr(arr, func)()
        # NaT == NaT evaluates to False, so the branch must use identity;
        # with `==` the NaT cases silently fell through to np.isnan instead.
        if expected is pd.NaT:
            assert result is pd.NaT
        else:
            assert np.isnan(result)