pandas-dev · jreback · Sep 14, 2021 · Aug 31, 2021 · Aug 31, 2021 · Aug 31, 2021
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
@@ -180,6 +180,33 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
  self.roll.quantile(percentile, interpolation=interpolation)
 
 
+class Rank:
+ params = (
+ ["DataFrame", "Series"],
+ [10, 1000],
+ ["int", "float"],
+ [True, False],
+ [True, False],
+ ["min", "max", "average"],
+ )
+ param_names = [
+ "constructor",
+ "window",
+ "dtype",
+ "percentile",
+ "ascending",
+ "method",
+ ]
+
+ def setup(self, constructor, window, dtype, percentile, ascending, method):
+ N = 10 ** 5
+ arr = np.random.random(N).astype(dtype)
+ self.roll = getattr(pd, constructor)(arr).rolling(window)
+
+ def time_rank(self, constructor, window, dtype, percentile, ascending, method):
+ self.roll.rank(pct=percentile, ascending=ascending, method=method)
+
+
 class PeakMemFixedWindowMinMax:
 
  params = ["min", "max"]

diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst
@@ -35,6 +35,7 @@ Rolling window functions
  Rolling.aggregate
  Rolling.quantile
  Rolling.sem
+ Rolling.rank
 
 .. _api.functions_window:
 
@@ -75,6 +76,7 @@ Expanding window functions
  Expanding.aggregate
  Expanding.quantile
  Expanding.sem
+ Expanding.rank
 
 .. _api.functions_ewm:
 

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -87,6 +87,53 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow
 :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines
 with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
 
+.. _whatsnew_140.enhancements.window_rank:
+
+Rank function for rolling and expanding windows
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Added ``rank`` function to :class:`Rolling` and :class:`Expanding`. The new function supports the ``method``, ``ascending``, and ``pct`` flags of :meth:`DataFrame.rank`. The ``method`` argument supports ``min``, ``max``, and ``average`` ranking methods.
+Example:
+
+.. code-block:: python
+
+ >>> s = pd.Series([1, 4, 2, 3, 5, 3])
+ >>> s.rolling(3).rank()
+ 0 NaN
+ 1 NaN
+ 2 2.0
+ 3 2.0
+ 4 3.0
+ 5 1.5
+ dtype: float64
+
+ >>> s.rolling(3).rank(method="max")
+ 0 NaN
+ 1 NaN
+ 2 2.0
+ 3 2.0
+ 4 3.0
+ 5 2.0
+ dtype: float64
+
+ >>> s.expanding().rank()
+ 0 1.0
+ 1 2.0
+ 2 2.0
+ 3 3.0
+ 4 5.0
+ 5 3.5
+ dtype: float64
+
+ >>> s.expanding().rank(method="max")
+ 0 1.0
+ 1 2.0
+ 2 2.0
+ 3 3.0
+ 4 5.0
+ 5 4.0
+ dtype: float64
+
 .. _whatsnew_140.enhancements.other:
 
 Other enhancements

diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h
@@ -180,10 +180,28 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
  return node->value;
 }
 
+PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
+    // Returns one plus the total width of the links crossed below.
+    node_t *cur = skp->head;
+    int lvl = skp->maxlevels, below = 0;
+
+    while (--lvl >= 0) {
+        // NOTE(review): assumes "> 0" means next sorts before value.
+        for (; _node_cmp(cur->next[lvl], value) > 0; cur = cur->next[lvl]) {
+            below += cur->width[lvl];
+        }
+    }
+
+    return below + 1;
+}
+
+// Returns the rank of the inserted element. When there are duplicates,
+// `rank` is the highest of the group, i.e. the 'max' method of
+// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html
 PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
  node_t *node, *prevnode, *newnode, *next_at_level;
  int *steps_at_level;
- int size, steps, level;
+ int size, steps, level, rank = 0;
  node_t **chain;
 
  chain = skp->tmp_chain;
@@ -197,6 +215,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
  next_at_level = node->next[level];
  while (_node_cmp(next_at_level, value) >= 0) {
  steps_at_level[level] += node->width[level];
+ rank += node->width[level];
  node = next_at_level;
  next_at_level = node->next[level];
  }
@@ -230,7 +249,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
 
  ++(skp->size);
 
- return 1;
+ return rank + 1;
 }
 
 PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {

diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
@@ -63,6 +63,15 @@ def roll_quantile(
  quantile: float, # float64_t
  interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
 ) -> np.ndarray: ... # np.ndarray[float]
+def roll_rank(
+ values: np.ndarray,  # const float64_t[:]
+ start: np.ndarray,  # np.ndarray[np.int64]
+ end: np.ndarray,  # np.ndarray[np.int64]
+ minp: int,  # int64_t
+ percentile: bool,  # bint
+ method: Literal["average", "min", "max"],
+ ascending: bool,  # bint
+) -> np.ndarray: ... # np.ndarray[float]
 def roll_apply(
  obj: object,
  start: np.ndarray, # np.ndarray[np.int64]

diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -50,6 +50,8 @@ cdef extern from "../src/skiplist.h":
  double skiplist_get(skiplist_t*, int, int*) nogil
  int skiplist_insert(skiplist_t*, double) nogil
  int skiplist_remove(skiplist_t*, double) nogil
+ int skiplist_rank(skiplist_t*, double) nogil
+ int skiplist_min_rank(skiplist_t*, double) nogil
 
 cdef:
  float32_t MINfloat32 = np.NINF
@@ -795,7 +797,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
  val = values[j]
  if notnan(val):
  nobs += 1
- err = skiplist_insert(sl, val) != 1
+ err = skiplist_insert(sl, val) == -1
  if err:
  break
 
@@ -806,7 +808,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
  val = values[j]
  if notnan(val):
  nobs += 1
- err = skiplist_insert(sl, val) != 1
+ err = skiplist_insert(sl, val) == -1
  if err:
  break
 
@@ -1139,6 +1141,120 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
  return output
 
 
+cdef enum RankType:  # tie-handling modes accepted by roll_rank
+ AVERAGE,  # mean of the ranks in a group of ties
+ MIN,  # lowest rank in the group
+ MAX,  # highest rank in the group
+
+
+rank_types = {  # maps the ``method`` string to its RankType member
+ 'average': AVERAGE,
+ 'min': MIN,
+ 'max': MAX,
+}
+
+
+def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
+              ndarray[int64_t] end, int64_t minp, bint percentile,
+              str method, bint ascending) -> np.ndarray:
+    """
+    Rank of each window's most recently added value among its non-NaN values.
+
+    O(N log(window)) implementation using skip list, derived from roll_quantile
+    """
+    cdef:
+        Py_ssize_t i, j, s, e, N = len(values)
+        float64_t rank_min = 0, rank = 0
+        int64_t nobs = 0, win
+        float64_t val
+        skiplist_t *skiplist
+        float64_t[::1] output
+        RankType rank_type
+
+    try:
+        rank_type = rank_types[method]
+    except KeyError:
+        raise ValueError(f"Method '{method}' is not supported")
+
+    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
+        start, end
+    )
+    # we use the Fixed/Variable Indexer here as the
+    # actual skiplist ops outweigh any window computation costs
+    output = np.empty(N, dtype=np.float64)
+
+    win = (end - start).max()  # widest window
+    if win == 0:  # every window is empty
+        output[:] = NaN
+        return np.asarray(output)
+    skiplist = skiplist_init(<int>win)
+    if skiplist == NULL:
+        raise MemoryError("skiplist_init failed")
+
+    with nogil:
+        for i in range(N):
+            s = start[i]
+            e = end[i]
+
+            if i == 0 or not is_monotonic_increasing_bounds:
+                if not is_monotonic_increasing_bounds:
+                    nobs = 0
+                    skiplist_destroy(skiplist)
+                    skiplist = skiplist_init(<int>win)  # NOTE(review): not NULL-checked
+
+                # setup (values are negated to rank in descending order)
+                for j in range(s, e):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        nobs += 1
+                        rank = skiplist_insert(skiplist, val)  # 'max'-method rank
+                        if rank == -1:  # NOTE(review): raise skips skiplist_destroy
+                            raise MemoryError("skiplist_insert failed")
+                        if rank_type == AVERAGE:  # mean of tied ranks rank_min..rank
+                            rank_min = skiplist_min_rank(skiplist, val)
+                            rank = (((rank * (rank + 1) / 2)
+                                    - ((rank_min - 1) * rank_min / 2))
+                                    / (rank - rank_min + 1))
+                        elif rank_type == MIN:
+                            rank = skiplist_min_rank(skiplist, val)
+                    else:
+                        rank = NaN
+
+            else:
+                # calculate deletes
+                for j in range(start[i - 1], s):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        skiplist_remove(skiplist, val)
+                        nobs -= 1
+
+                # calculate adds
+                for j in range(end[i - 1], e):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        nobs += 1
+                        rank = skiplist_insert(skiplist, val)  # 'max'-method rank
+                        if rank == -1:  # NOTE(review): raise skips skiplist_destroy
+                            raise MemoryError("skiplist_insert failed")
+                        if rank_type == AVERAGE:  # mean of tied ranks rank_min..rank
+                            rank_min = skiplist_min_rank(skiplist, val)
+                            rank = (((rank * (rank + 1) / 2)
+                                    - ((rank_min - 1) * rank_min / 2))
+                                    / (rank - rank_min + 1))
+                        elif rank_type == MIN:
+                            rank = skiplist_min_rank(skiplist, val)
+                    else:
+                        rank = NaN
+            if nobs >= minp:
+                output[i] = <float64_t>(rank) / nobs if percentile else rank
+            else:
+                output[i] = NaN
+
+    skiplist_destroy(skiplist)
+
+    return np.asarray(output)
+
+
 def roll_apply(object obj,
  ndarray[int64_t] start, ndarray[int64_t] end,
  int64_t minp,

diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
@@ -564,6 +564,81 @@ def quantile(
  **kwargs,
  )
 
+ @doc(
+ template_header,
+ ".. versionadded:: 1.4.0 \n\n",
+ create_section_header("Parameters"),
+ dedent(
+ """
+ method : {{'average', 'min', 'max'}}, default 'average'
+ How to rank the group of records that have the same value (i.e. ties):
+
+ * average: average rank of the group
+ * min: lowest rank in the group
+ * max: highest rank in the group
+
+ ascending : bool, default True
+ Whether or not the elements should be ranked in ascending order.
+ pct : bool, default False
+ Whether or not to display the returned rankings in percentile
+ form.
+ """
+ ).replace("\n", "", 1),  # drop the newline right after the opening quotes
+ kwargs_compat,
+ create_section_header("Returns"),
+ template_returns,
+ create_section_header("See Also"),
+ template_see_also,
+ create_section_header("Examples"),
+ dedent(
+ """
+ >>> s = pd.Series([1, 4, 2, 3, 5, 3])
+ >>> s.expanding().rank()
+ 0 1.0
+ 1 2.0
+ 2 2.0
+ 3 3.0
+ 4 5.0
+ 5 3.5
+ dtype: float64
+
+ >>> s.expanding().rank(method="max")
+ 0 1.0
+ 1 2.0
+ 2 2.0
+ 3 3.0
+ 4 5.0
+ 5 4.0
+ dtype: float64
+
+ >>> s.expanding().rank(method="min")
+ 0 1.0
+ 1 2.0
+ 2 2.0
+ 3 3.0
+ 4 5.0
+ 5 3.0
+ dtype: float64
+ """
+ ).replace("\n", "", 1),  # drop the newline right after the opening quotes
+ window_method="expanding",
+ aggregation_description="rank",
+ agg_method="rank",
+ )
+ def rank(
+ self,
+ method: str = "average",
+ ascending: bool = True,
+ pct: bool = False,
+ **kwargs,
+ ):
+ return super().rank(  # defer to the parent class's rank implementation
+ method=method,
+ ascending=ascending,
+ pct=pct,
+ **kwargs,
+ )
+
  @doc(
  template_header,
  create_section_header("Parameters"),