ENH: rolling rank

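Address review comments: move TiebreakEnumType and tiebreakers into
algos.pxd and reuse them in roll_rank, add the WindowingRankType alias for
the `method` argument, document skiplist_min_rank and the average-rank
formula, trim the whatsnew example output, and shrink the rank tests
(adding +/-inf to the NaN test data).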
pandas-dev · jreback · Sep 14, 2021 · Aug 31, 2021 · Aug 31, 2021 · Aug 31, 2021
commit ba468c6eb19b422a151719fc931b4d06ba00dae2
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -99,40 +99,8 @@ Example:
 
  >>> s = pd.Series([1, 4, 2, 3, 5, 3])
  >>> s.rolling(3).rank()
- 0 NaN
- 1 NaN
- 2 2.0
- 3 2.0
- 4 3.0
- 5 1.5
- dtype: float64
 
  >>> s.rolling(3).rank(method="max")
- 0 NaN
- 1 NaN
- 2 2.0
- 3 2.0
- 4 3.0
- 5 2.0
- dtype: float64
-
- >>> s.expanding().rank()
- 0 1.0
- 1 2.0
- 2 2.0
- 3 3.0
- 4 5.0
- 5 3.5
- dtype: float64
-
- >>> s.expanding().rank(method="max")
- 0 1.0
- 1 2.0
- 2 2.0
- 3 3.0
- 4 5.0
- 5 4.0
- dtype: float64
 
 .. _whatsnew_140.enhancements.other:
 

diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd
@@ -2,3 +2,19 @@ from pandas._libs.util cimport numeric
 
 
 cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil
+
+cdef enum TiebreakEnumType:
+ TIEBREAK_AVERAGE
+ TIEBREAK_MIN,
+ TIEBREAK_MAX
+ TIEBREAK_FIRST
+ TIEBREAK_FIRST_DESCENDING
+ TIEBREAK_DENSE
+
+tiebreakers = {
+ "average": TIEBREAK_AVERAGE,
+ "min": TIEBREAK_MIN,
+ "max": TIEBREAK_MAX,
+ "first": TIEBREAK_FIRST,
+ "dense": TIEBREAK_DENSE,
+}
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -64,22 +64,6 @@ cdef:
  float64_t NaN = <float64_t>np.NaN
  int64_t NPY_NAT = get_nat()
 
-cdef enum TiebreakEnumType:
- TIEBREAK_AVERAGE
- TIEBREAK_MIN,
- TIEBREAK_MAX
- TIEBREAK_FIRST
- TIEBREAK_FIRST_DESCENDING
- TIEBREAK_DENSE
-
-tiebreakers = {
- "average": TIEBREAK_AVERAGE,
- "min": TIEBREAK_MIN,
- "max": TIEBREAK_MAX,
- "first": TIEBREAK_FIRST,
- "dense": TIEBREAK_DENSE,
-}
-
 
 cdef inline bint are_diff(object left, object right):
  try:

diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h
@@ -180,6 +180,8 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
  return node->value;
 }
 
+// Returns the lowest rank of all elements with value `value`, as opposed to the
+// highest rank returned by `skiplist_insert`.
 PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
  node_t *node;
  int level, rank = 0;

diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
@@ -6,6 +6,8 @@ from typing import (
 
 import numpy as np
 
+from pandas._typing import WindowingRankType
+
 def roll_sum(
  values: np.ndarray, # const float64_t[:]
  start: np.ndarray, # np.ndarray[np.int64]
@@ -69,7 +71,7 @@ def roll_rank(
  end: np.ndarray,
  minp: int,
  percentile: bool,
- method: Literal["average", "min", "max"],
+ method: WindowingRankType,
  ascending: bool,
 ) -> np.ndarray: ... # np.ndarray[float]
 def roll_apply(

diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -5,6 +5,11 @@ import cython
 from libc.math cimport round
 from libcpp.deque cimport deque
 
+from pandas._libs.algos cimport (
+ TiebreakEnumType,
+ tiebreakers,
+)
+
 import numpy as np
 
 cimport numpy as cnp
@@ -1141,19 +1146,6 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
  return output
 
 
-cdef enum RankType:
- AVERAGE,
- MIN,
- MAX,
-
-
-rank_types = {
- 'average': AVERAGE,
- 'min': MIN,
- 'max': MAX,
-}
-
-
 def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
  ndarray[int64_t] end, int64_t minp, bint percentile,
  str method, bint ascending) -> np.ndarray:
@@ -1168,13 +1160,17 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
  int64_t nobs = 0, win
  float64_t val
  skiplist_t *skiplist
- float64_t[::1] output = None
- RankType rank_type
+ float64_t[::1] output
+ TiebreakEnumType rank_type
 
  try:
- rank_type = rank_types[method]
+ rank_type = tiebreakers[method]
  except KeyError:
  raise ValueError(f"Method '{method}' is not supported")
+ if rank_type not in (TiebreakEnumType.TIEBREAK_AVERAGE,
+ TiebreakEnumType.TIEBREAK_MIN,
+ TiebreakEnumType.TIEBREAK_MAX):
+ raise ValueError(f"Method '{method}' is not supported")
 
  is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
  start, end
@@ -1210,12 +1206,20 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
  rank = skiplist_insert(skiplist, val)
  if rank == -1:
  raise MemoryError("skiplist_insert failed")
- if rank_type == AVERAGE:
+ if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
+ # The average rank of `val` is the sum of the ranks of all
+ # instances of `val` in the skip list divided by the number
+ # of instances. The sum of consecutive integers from 1 to N
+ # is N * (N + 1) / 2.
+ # The sum of the ranks is the sum of integers from the
+ # lowest rank to the highest rank, which is the sum of
+ # integers from 1 to the highest rank minus the sum of
+ # integers from 1 to one less than the lowest rank.
  rank_min = skiplist_min_rank(skiplist, val)
  rank = (((rank * (rank + 1) / 2)
  - ((rank_min - 1) * rank_min / 2))
  / (rank - rank_min + 1))
- elif rank_type == MIN:
+ elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
  rank = skiplist_min_rank(skiplist, val)
  else:
  rank = NaN
@@ -1236,17 +1240,17 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
  rank = skiplist_insert(skiplist, val)
  if rank == -1:
  raise MemoryError("skiplist_insert failed")
- if rank_type == AVERAGE:
+ if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
  rank_min = skiplist_min_rank(skiplist, val)
  rank = (((rank * (rank + 1) / 2)
  - ((rank_min - 1) * rank_min / 2))
  / (rank - rank_min + 1))
- elif rank_type == MIN:
+ elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
  rank = skiplist_min_rank(skiplist, val)
  else:
  rank = NaN
  if nobs >= minp:
- output[i] = <float64_t>(rank) / nobs if percentile else rank
+ output[i] = rank / nobs if percentile else rank
  else:
  output[i] = NaN
 

diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -208,3 +208,6 @@
 PositionalIndexer2D = Union[
  PositionalIndexer, Tuple[PositionalIndexer, PositionalIndexer]
 ]
+
+# Windowing rank methods
+WindowingRankType = Literal["average", "min", "max"]
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
@@ -10,6 +10,7 @@
 from pandas._typing import (
  Axis,
  FrameOrSeries,
+ WindowingRankType,
 )
 
 if TYPE_CHECKING:
@@ -627,7 +628,7 @@ def quantile(
  )
  def rank(
  self,
- method: str = "average",
+ method: WindowingRankType = "average",
  ascending: bool = True,
  pct: bool = False,
  **kwargs,

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -28,6 +28,7 @@
  ArrayLike,
  Axis,
  FrameOrSeries,
+ WindowingRankType,
 )
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
@@ -1411,7 +1412,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
 
  def rank(
  self,
- method: str = "average",
+ method: WindowingRankType = "average",
  ascending: bool = True,
  pct: bool = False,
  **kwargs,
@@ -2239,7 +2240,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
  )
  def rank(
  self,
- method: str = "average",
+ method: WindowingRankType = "average",
  ascending: bool = True,
  pct: bool = False,
  **kwargs,

diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py
@@ -266,19 +266,21 @@ def test_expanding_skew_kurt_numerical_stability(method):
  tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000])
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
 @pytest.mark.parametrize("method", ["min", "max", "average"])
 @pytest.mark.parametrize("pct", [True, False])
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
 def test_rank(window, method, pct, ascending, test_data):
- length = 1000
+ length = 20
  if test_data == "default":
  ser = Series(data=np.random.rand(length))
  elif test_data == "duplicates":
  ser = Series(data=np.random.choice(3, length))
  elif test_data == "nans":
- ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length))
+ ser = Series(
+ data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
+ )
 
  expected = ser.expanding(window).apply(
  lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]

diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1502,19 +1502,21 @@ def test_rolling_numeric_dtypes():
  tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000])
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
 @pytest.mark.parametrize("method", ["min", "max", "average"])
 @pytest.mark.parametrize("pct", [True, False])
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
 def test_rank(window, method, pct, ascending, test_data):
- length = 1000
+ length = 20
  if test_data == "default":
  ser = Series(data=np.random.rand(length))
  elif test_data == "duplicates":
  ser = Series(data=np.random.choice(3, length))
  elif test_data == "nans":
- ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length))
+ ser = Series(
+ data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
+ )
 
  expected = ser.rolling(window).apply(
  lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]