Skip to content
Prev Previous commit
Next Next commit
ENH: rolling rank
addressing comments
  • Loading branch information
gsiano committed Sep 13, 2021
commit ba468c6eb19b422a151719fc931b4d06ba00dae2
32 changes: 0 additions & 32 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,40 +99,8 @@ Example:

>>> s = pd.Series([1, 4, 2, 3, 5, 3])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just put the commands; the output will be rendered during the doc build (e.g. keep L100 without the '>>>' prompt, and drop the output lines below).

>>> s.rolling(3).rank()
0 NaN
1 NaN
2 2.0
3 2.0
4 3.0
5 1.5
dtype: float64

>>> s.rolling(3).rank(method="max")
0 NaN
1 NaN
2 2.0
3 2.0
4 3.0
5 2.0
dtype: float64

>>> s.expanding().rank()
0 1.0
1 2.0
2 2.0
3 3.0
4 5.0
5 3.5
dtype: float64

>>> s.expanding().rank(method="max")
0 1.0
1 2.0
2 2.0
3 3.0
4 5.0
5 4.0
dtype: float64

.. _whatsnew_140.enhancements.other:

Expand Down
16 changes: 16 additions & 0 deletions pandas/_libs/algos.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,19 @@ from pandas._libs.util cimport numeric


cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil

# Tie-breaking strategies used when ranking equal values. Declared here so
# that other Cython modules can ``cimport`` the enum instead of redefining it.
# NOTE(review): TIEBREAK_MIN is the only member followed by a comma; this looks
# like a harmless inconsistency -- confirm Cython accepts the mixed separators.
cdef enum TiebreakEnumType:
TIEBREAK_AVERAGE
TIEBREAK_MIN,
TIEBREAK_MAX
TIEBREAK_FIRST
TIEBREAK_FIRST_DESCENDING
TIEBREAK_DENSE

# Maps a ranking ``method`` string to its TiebreakEnumType member.
# No key maps to TIEBREAK_FIRST_DESCENDING.
# NOTE(review): this is a Python-level dict assignment inside a .pxd file,
# and it is pulled in elsewhere via ``cimport``. .pxd files normally hold only
# C-level declarations -- confirm this compiles, or keep the dict in algos.pyx
# and use a regular ``import`` for it.
tiebreakers = {
"average": TIEBREAK_AVERAGE,
"min": TIEBREAK_MIN,
"max": TIEBREAK_MAX,
"first": TIEBREAK_FIRST,
"dense": TIEBREAK_DENSE,
}
16 changes: 0 additions & 16 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,6 @@ cdef:
float64_t NaN = <float64_t>np.NaN
int64_t NPY_NAT = get_nat()

cdef enum TiebreakEnumType:
TIEBREAK_AVERAGE
TIEBREAK_MIN,
TIEBREAK_MAX
TIEBREAK_FIRST
TIEBREAK_FIRST_DESCENDING
TIEBREAK_DENSE

tiebreakers = {
"average": TIEBREAK_AVERAGE,
"min": TIEBREAK_MIN,
"max": TIEBREAK_MAX,
"first": TIEBREAK_FIRST,
"dense": TIEBREAK_DENSE,
}


cdef inline bint are_diff(object left, object right):
try:
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/src/skiplist.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
return node->value;
}

// Returns the lowest rank of all elements with value `value`, as opposed to the
// highest rank returned by `skiplist_insert`.
PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
node_t *node;
int level, rank = 0;
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/window/aggregations.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ from typing import (

import numpy as np

from pandas._typing import WindowingRankType

def roll_sum(
values: np.ndarray, # const float64_t[:]
start: np.ndarray, # np.ndarray[np.int64]
Expand Down Expand Up @@ -69,7 +71,7 @@ def roll_rank(
end: np.ndarray,
minp: int,
percentile: bool,
method: Literal["average", "min", "max"],
method: WindowingRankType,
ascending: bool,
) -> np.ndarray: ... # np.ndarray[float]
def roll_apply(
Expand Down
46 changes: 25 additions & 21 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ import cython
from libc.math cimport round
from libcpp.deque cimport deque

from pandas._libs.algos cimport (
TiebreakEnumType,
tiebreakers,
)

import numpy as np

cimport numpy as cnp
Expand Down Expand Up @@ -1141,19 +1146,6 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
return output


cdef enum RankType:
AVERAGE,
MIN,
MAX,


rank_types = {
'average': AVERAGE,
'min': MIN,
'max': MAX,
}


def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp, bint percentile,
str method, bint ascending) -> np.ndarray:
Expand All @@ -1168,13 +1160,17 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
int64_t nobs = 0, win
float64_t val
skiplist_t *skiplist
float64_t[::1] output = None
RankType rank_type
float64_t[::1] output
TiebreakEnumType rank_type

try:
rank_type = rank_types[method]
rank_type = tiebreakers[method]
except KeyError:
raise ValueError(f"Method '{method}' is not supported")
if rank_type not in (TiebreakEnumType.TIEBREAK_AVERAGE,
TiebreakEnumType.TIEBREAK_MIN,
TiebreakEnumType.TIEBREAK_MAX):
raise ValueError(f"Method '{method}' is not supported")

is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
start, end
Expand Down Expand Up @@ -1210,12 +1206,20 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
rank = skiplist_insert(skiplist, val)
if rank == -1:
raise MemoryError("skiplist_insert failed")
if rank_type == AVERAGE:
if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
# The average rank of `val` is the sum of the ranks of all
# instances of `val` in the skip list divided by the number
# of instances. The sum of consecutive integers from 1 to N
# is N * (N + 1) / 2.
# The sum of the ranks is the sum of integers from the
# lowest rank to the highest rank, which is the sum of
# integers from 1 to the highest rank minus the sum of
# integers from 1 to one less than the lowest rank.
rank_min = skiplist_min_rank(skiplist, val)
rank = (((rank * (rank + 1) / 2)
- ((rank_min - 1) * rank_min / 2))
/ (rank - rank_min + 1))
elif rank_type == MIN:
elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
rank = skiplist_min_rank(skiplist, val)
else:
rank = NaN
Expand All @@ -1236,17 +1240,17 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
rank = skiplist_insert(skiplist, val)
if rank == -1:
raise MemoryError("skiplist_insert failed")
if rank_type == AVERAGE:
if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
rank_min = skiplist_min_rank(skiplist, val)
rank = (((rank * (rank + 1) / 2)
- ((rank_min - 1) * rank_min / 2))
/ (rank - rank_min + 1))
elif rank_type == MIN:
elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
rank = skiplist_min_rank(skiplist, val)
else:
rank = NaN
if nobs >= minp:
output[i] = <float64_t>(rank) / nobs if percentile else rank
output[i] = rank / nobs if percentile else rank
else:
output[i] = NaN

Expand Down
3 changes: 3 additions & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,6 @@
PositionalIndexer2D = Union[
PositionalIndexer, Tuple[PositionalIndexer, PositionalIndexer]
]

# Windowing rank methods: the tie-breaking ``method`` values accepted by the
# rolling/expanding ``rank`` methods and by the ``roll_rank`` aggregation stub.
WindowingRankType = Literal["average", "min", "max"]
3 changes: 2 additions & 1 deletion pandas/core/window/expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandas._typing import (
Axis,
FrameOrSeries,
WindowingRankType,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -627,7 +628,7 @@ def quantile(
)
def rank(
self,
method: str = "average",
method: WindowingRankType = "average",
ascending: bool = True,
pct: bool = False,
**kwargs,
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
ArrayLike,
Axis,
FrameOrSeries,
WindowingRankType,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
Expand Down Expand Up @@ -1411,7 +1412,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):

def rank(
self,
method: str = "average",
method: WindowingRankType = "average",
ascending: bool = True,
pct: bool = False,
**kwargs,
Expand Down Expand Up @@ -2239,7 +2240,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
)
def rank(
self,
method: str = "average",
method: WindowingRankType = "average",
ascending: bool = True,
pct: bool = False,
**kwargs,
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/window/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,19 +266,21 @@ def test_expanding_skew_kurt_numerical_stability(method):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000])
@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("method", ["min", "max", "average"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
def test_rank(window, method, pct, ascending, test_data):
length = 1000
length = 20
if test_data == "default":
ser = Series(data=np.random.rand(length))
elif test_data == "duplicates":
ser = Series(data=np.random.choice(3, length))
elif test_data == "nans":
ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length))
ser = Series(
data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
)

expected = ser.expanding(window).apply(
lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1502,19 +1502,21 @@ def test_rolling_numeric_dtypes():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000])
@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("method", ["min", "max", "average"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
def test_rank(window, method, pct, ascending, test_data):
length = 1000
length = 20
if test_data == "default":
ser = Series(data=np.random.rand(length))
elif test_data == "duplicates":
ser = Series(data=np.random.choice(3, length))
elif test_data == "nans":
ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length))
ser = Series(
data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
)

expected = ser.rolling(window).apply(
lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]
Expand Down