-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
ENH: Rolling rank #43338
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Rolling rank #43338
Changes from 14 commits
3ebf8c0 ce754f7 f13a720 874c980 4d06ba3 1308208 4caa51b f2ee5b2 b135f1e fda85b4 e692ce3 6b23fc0 5f7d319 63d37c5 ba468c6 e078119 bb7005f 1470c7b File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -87,6 +87,53 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow | |
| :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines | ||
| with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`) | ||
| | ||
| .. _whatsnew_140.enhancements.window_rank: | ||
| | ||
| Rank function for rolling and expanding windows | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
| | ||
| Added ``rank`` function to :class:`Rolling` and :class:`Expanding`. The new function supports the ``method``, ``ascending``, and ``pct`` flags of :meth:`DataFrame.rank`. The ``method`` argument supports ``min``, ``max``, and ``average`` ranking methods. | ||
| Example: | ||
| | ||
| .. ipython:: python | ||
| | ||
| >>> s = pd.Series([1, 4, 2, 3, 5, 3]) | ||
| >>> s.rolling(3).rank() | ||
| 0 NaN | ||
| 1 NaN | ||
| 2 2.0 | ||
| 3 2.0 | ||
| 4 3.0 | ||
| 5 1.5 | ||
| dtype: float64 | ||
| | ||
| >>> s.rolling(3).rank(method="max") | ||
| 0 NaN | ||
| 1 NaN | ||
| 2 2.0 | ||
| 3 2.0 | ||
| 4 3.0 | ||
| 5 2.0 | ||
| dtype: float64 | ||
| | ||
| >>> s.expanding().rank() | ||
| ||
| 0 1.0 | ||
| 1 2.0 | ||
| 2 2.0 | ||
| 3 3.0 | ||
| 4 5.0 | ||
| 5 3.5 | ||
| dtype: float64 | ||
| | ||
| >>> s.expanding().rank(method="max") | ||
| 0 1.0 | ||
| 1 2.0 | ||
| 2 2.0 | ||
| 3 3.0 | ||
| 4 5.0 | ||
| 5 4.0 | ||
| dtype: float64 | ||
| | ||
| .. _whatsnew_140.enhancements.other: | ||
| | ||
| Other enhancements | ||
| | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| | @@ -50,6 +50,8 @@ cdef extern from "../src/skiplist.h": | |||||
| double skiplist_get(skiplist_t*, int, int*) nogil | ||||||
| int skiplist_insert(skiplist_t*, double) nogil | ||||||
| int skiplist_remove(skiplist_t*, double) nogil | ||||||
| int skiplist_rank(skiplist_t*, double) nogil | ||||||
| int skiplist_min_rank(skiplist_t*, double) nogil | ||||||
| | ||||||
| cdef: | ||||||
| float32_t MINfloat32 = np.NINF | ||||||
| | @@ -795,7 +797,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, | |||||
| val = values[j] | ||||||
| if notnan(val): | ||||||
| nobs += 1 | ||||||
| err = skiplist_insert(sl, val) != 1 | ||||||
| err = skiplist_insert(sl, val) == -1 | ||||||
| if err: | ||||||
| break | ||||||
| | ||||||
| | @@ -806,7 +808,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, | |||||
| val = values[j] | ||||||
| if notnan(val): | ||||||
| nobs += 1 | ||||||
| err = skiplist_insert(sl, val) != 1 | ||||||
| err = skiplist_insert(sl, val) == -1 | ||||||
| if err: | ||||||
| break | ||||||
| | ||||||
| | @@ -1139,6 +1141,120 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, | |||||
| return output | ||||||
| | ||||||
| | ||||||
| cdef enum RankType: | ||||||
| ||||||
| AVERAGE, | ||||||
| MIN, | ||||||
| MAX, | ||||||
| | ||||||
| | ||||||
| rank_types = { | ||||||
| 'average': AVERAGE, | ||||||
| 'min': MIN, | ||||||
| 'max': MAX, | ||||||
| } | ||||||
| | ||||||
| | ||||||
| def roll_rank(const float64_t[:] values, ndarray[int64_t] start, | ||||||
| ndarray[int64_t] end, int64_t minp, bint percentile, | ||||||
| str method, bint ascending) -> np.ndarray: | ||||||
| """ | ||||||
| O(N log(window)) implementation using skip list | ||||||
| | ||||||
| derived from roll_quantile | ||||||
| """ | ||||||
| cdef: | ||||||
| Py_ssize_t i, j, s, e, N = len(values), idx | ||||||
| float64_t rank_min = 0, rank = 0 | ||||||
| int64_t nobs = 0, win | ||||||
| float64_t val | ||||||
| skiplist_t *skiplist | ||||||
| float64_t[::1] output = None | ||||||
| ||||||
| float64_t[::1] output = None | |
| float64_t[::1] output |
NBD since it doesn't affect correctness, but I find this clearer, since None initialization is usually used only when there's a path where the variable might not end up initialized. It also generates a bit less code :)
jreback marked this conversation as resolved. Show resolved Hide resolved
jreback marked this conversation as resolved. Show resolved Hide resolved
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the cast here necessary?
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -564,6 +564,81 @@ def quantile( | |
| **kwargs, | ||
| ) | ||
| | ||
| @doc( | ||
| template_header, | ||
| ".. versionadded:: 1.4.0 \n\n", | ||
| create_section_header("Parameters"), | ||
| dedent( | ||
| """ | ||
| method : {{'average', 'min', 'max'}}, default 'average' | ||
| How to rank the group of records that have the same value (i.e. ties): | ||
| | ||
| * average: average rank of the group | ||
| * min: lowest rank in the group | ||
| * max: highest rank in the group | ||
| | ||
| ascending : bool, default True | ||
| Whether or not the elements should be ranked in ascending order. | ||
| pct : bool, default False | ||
| Whether or not to display the returned rankings in percentile | ||
| form. | ||
| """ | ||
| ).replace("\n", "", 1), | ||
| kwargs_compat, | ||
| create_section_header("Returns"), | ||
| template_returns, | ||
| create_section_header("See Also"), | ||
| template_see_also, | ||
| create_section_header("Examples"), | ||
| dedent( | ||
| """ | ||
| >>> s = pd.Series([1, 4, 2, 3, 5, 3]) | ||
| >>> s.expanding().rank() | ||
| 0 1.0 | ||
| 1 2.0 | ||
| 2 2.0 | ||
| 3 3.0 | ||
| 4 5.0 | ||
| 5 3.5 | ||
| dtype: float64 | ||
| | ||
| >>> s.expanding().rank(method="max") | ||
| 0 1.0 | ||
| 1 2.0 | ||
| 2 2.0 | ||
| 3 3.0 | ||
| 4 5.0 | ||
| 5 4.0 | ||
| dtype: float64 | ||
| | ||
| >>> s.expanding().rank(method="min") | ||
| 0 1.0 | ||
| 1 2.0 | ||
| 2 2.0 | ||
| 3 3.0 | ||
| 4 5.0 | ||
| 5 3.0 | ||
| dtype: float64 | ||
| """ | ||
| ).replace("\n", "", 1), | ||
| window_method="expanding", | ||
| aggregation_description="rank", | ||
| agg_method="rank", | ||
| ) | ||
| def rank( | ||
| self, | ||
| method: str = "average", | ||
| ||
| ascending: bool = True, | ||
| pct: bool = False, | ||
| **kwargs, | ||
| ): | ||
| return super().rank( | ||
| method=method, | ||
| ascending=ascending, | ||
| pct=pct, | ||
| **kwargs, | ||
| ) | ||
| | ||
| @doc( | ||
| template_header, | ||
| create_section_header("Parameters"), | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just put the commands; their output will be rendered during the doc build (e.g. L100 without the '>>>' prompt and without the pasted output below it).