Skip to content
14 changes: 14 additions & 0 deletions asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,18 @@ def time_division(self, fill_value):
self.arr1 / self.arr2


class MinMax:
    """ASV benchmark: time ``SparseArray.min``/``max`` for null and non-null fills."""

    # Cartesian product: reduction name x fill value (NaN = null fill).
    params = (["min", "max"], [0.0, np.nan])
    param_names = ["func", "fill_value"]

    def setup(self, func, fill_value):
        # One million elements at 1e-5 density keeps the sparse path dominant.
        size = 1_000_000
        dense = make_array(size, 1e-5, fill_value, np.float64)
        self.sp_arr = SparseArray(dense, fill_value=fill_value)

    def time_min_max(self, func, fill_value):
        bound_method = getattr(self.sp_arr, func)
        bound_method()


from .pandas_vb_common import setup # noqa: F401 isort:skip
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ Performance improvements
- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -437,6 +438,7 @@ Reshaping
Sparse
^^^^^^
- Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`)
- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`)
-
-

Expand Down
66 changes: 56 additions & 10 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1456,23 +1456,69 @@ def mean(self, axis=0, *args, **kwargs):
nsparse = self.sp_index.ngaps
return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

def max(self, axis: int = 0, *args, **kwargs) -> Scalar:
    """
    Max of non-NA/null values

    Parameters
    ----------
    axis : int, default 0
        Not Used. NumPy compatibility.
    *args, **kwargs
        Not Used. NumPy compatibility.

    Returns
    -------
    scalar
    """
    # Reject unsupported numpy-compat arguments (e.g. out=, keepdims=).
    nv.validate_max(args, kwargs)
    # min/max share one implementation; dispatch on the reduction name.
    return self._min_max("max")
def min(self, axis: int = 0, *args, **kwargs) -> Scalar:
    """
    Min of non-NA/null values

    Parameters
    ----------
    axis : int, default 0
        Not Used. NumPy compatibility.
    *args, **kwargs
        Not Used. NumPy compatibility.

    Returns
    -------
    scalar
    """
    # Reject unsupported numpy-compat arguments (e.g. out=, keepdims=).
    nv.validate_min(args, kwargs)
    # min/max share one implementation; dispatch on the reduction name.
    return self._min_max("min")
def _min_max(self, kind: Literal["min", "max"]) -> Scalar:
"""
Min/max of non-NA/null values

# This condition returns a nan if there are no valid values in the array.
if self.size > 0 and self._valid_sp_values.size == 0:
Parameters
----------
kind : {"min", "max"}

Returns
-------
scalar
"""
valid_vals = self._valid_sp_values
has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
if len(valid_vals) > 0:
sp_min_max = getattr(valid_vals, kind)()

# If a non-null fill value is currently present, it might be the min/max
if has_nonnull_fill_vals:
func = max if kind == "max" else min
return func(sp_min_max, self.fill_value)
else:
return sp_min_max
elif has_nonnull_fill_vals:
return self.fill_value
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might has well just do elif else

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, updated

return np.nanmin(self, axis)
return na_value_for_dtype(self.dtype.subtype)

# ------------------------------------------------------------------------
# Ufuncs
Expand Down
56 changes: 42 additions & 14 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1362,26 +1362,54 @@ def test_drop_duplicates_fill_value():


class TestMinMax:
    """Tests for SparseArray.min/max across fill-value configurations."""

    @pytest.mark.parametrize(
        "raw_data,max_expected,min_expected",
        [
            (np.arange(5.0), [4], [0]),
            (-np.arange(5.0), [0], [-4]),
            (np.array([0, 1, 2, np.nan, 4]), [4], [0]),
            (np.array([np.nan] * 5), [np.nan], [np.nan]),
            (np.array([]), [np.nan], [np.nan]),
        ],
    )
    def test_nan_fill_value(self, raw_data, max_expected, min_expected):
        # `in` on a list checks identity before equality, so `np.nan in
        # [np.nan]` is True even though `np.nan == np.nan` is False.
        max_result = SparseArray(raw_data).max()
        min_result = SparseArray(raw_data).min()
        assert max_result in max_expected
        assert min_result in min_expected

    @pytest.mark.parametrize(
        "fill_value,max_expected,min_expected",
        [
            (100, 100, 0),
            (-100, 1, -100),
        ],
    )
    def test_fill_value(self, fill_value, max_expected, min_expected):
        arr = SparseArray(
            np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value)
        )
        max_result = arr.max()
        assert max_result == max_expected

        min_result = arr.min()
        assert min_result == min_expected

    @pytest.mark.parametrize("func", ["min", "max"])
    @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])])
    @pytest.mark.parametrize(
        "dtype,expected",
        [
            (SparseDtype(np.float64, np.nan), np.nan),
            (SparseDtype(np.float64, 5.0), np.nan),
            (SparseDtype("datetime64[ns]", pd.NaT), pd.NaT),
            (SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT),
        ],
    )
    def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
        arr = SparseArray(data, dtype=dtype)
        result = getattr(arr, func)()
        # NaT == NaT evaluates to False, so the branch must use identity;
        # with `==` the NaT cases silently fell through to np.isnan instead.
        if expected is pd.NaT:
            assert result is pd.NaT
        else:
            assert np.isnan(result)