Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
1f8c628
init commit kendall spearman ordinal cats
pandeconscious Oct 23, 2025
906f1e4
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Oct 27, 2025
497dc7e
series test update and fixes
pandeconscious Oct 27, 2025
583aca6
cat desc longer in tests
pandeconscious Oct 27, 2025
e069810
testing frame corr
pandeconscious Oct 27, 2025
b90726f
pre commit fixes v2
pandeconscious Oct 27, 2025
65a506c
cleanup
pandeconscious Oct 27, 2025
ab3b8b9
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 4, 2025
e93ed83
test import scipy fix
pandeconscious Nov 4, 2025
ec4d97e
rst sorting autofix
pandeconscious Nov 4, 2025
ebfc3b0
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 4, 2025
8cfacef
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 5, 2025
7ef7fb2
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 12, 2025
588808a
refactor
pandeconscious Nov 12, 2025
c484552
fix dtype for duplicates
pandeconscious Nov 12, 2025
216475c
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 16, 2025
e997747
clean up
pandeconscious Nov 16, 2025
4184167
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 16, 2025
8bcd3dc
Merge branch 'pandas-dev:main' into ordered_cat_corr
pandeconscious Nov 18, 2025
2673281
clean up
pandeconscious Nov 18, 2025
ff48847
import fix
pandeconscious Nov 18, 2025
1c69e29
test transform ordered cat func
pandeconscious Nov 18, 2025
8b26a7d
tests and mypy fixes
pandeconscious Nov 18, 2025
a625520
type check fix
pandeconscious Nov 18, 2025
259424e
addressing review comments
pandeconscious Nov 18, 2025
f141e6a
Merge branch 'main' into ordered_cat_corr
pandeconscious Nov 18, 2025
d2d0f71
type fix corr.py
pandeconscious Nov 19, 2025
858d0c2
ruff format
pandeconscious Nov 19, 2025
a8c88c7
mypy fix
pandeconscious Nov 19, 2025
1a472e3
Merge branch 'main' into ordered_cat_corr
pandeconscious Nov 19, 2025
71305aa
scipy unavailable fix in test
pandeconscious Nov 19, 2025
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
addressing review comments
  • Loading branch information
pandeconscious committed Nov 18, 2025
commit 259424e6ba56c60e3912c11ca16763b476f352db
3 changes: 1 addition & 2 deletions pandas/core/methods/corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@

def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame:
"""
any ordered categorical columns are transformed to the respective
categorical codes while other columns remain untouched
Replace ordered categoricals with their codes, making a shallow copy if necessary.
"""

result = df
Expand Down
48 changes: 15 additions & 33 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,31 +262,21 @@ def test_corr_rank_ordered_categorical(
):
df = DataFrame(
{
"ord_cat": Series(
pd.Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
"ord_cat": pd.Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
),
"ord_cat_none": Series(
pd.Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
)
"ord_cat_none": pd.Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
),
"ord_int": Series([0, 1, 2, 3]),
"ord_float": Series([2.0, 3.0, 4.5, 6.5]),
"ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
"ord_cat_shuff": Series(
pd.Categorical(
["m", "h", "vh", "low"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
"ord_cat_shuff": pd.Categorical(
["m", "h", "vh", "low"],
categories=["low", "m", "h", "vh"],
ordered=True,
),
"ord_int_shuff": Series([2, 3, 0, 1]),
}
)
corr_calc = df.corr(method=method)
Expand All @@ -300,24 +290,16 @@ def test_corr_rank_ordered_categorical_duplicate_columns(
self,
method,
):
cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
df = DataFrame(
{
"a": [1, 2, 3, 4],
"b": [4, 3, 2, 1],
"a": pd.array([1, 2, 3, 4], dtype=cat),
"b": pd.array([4, 3, 2, 1], dtype=cat),
"c": [4, 3, 2, 1],
"d": [10, 20, 30, 40],
"e": [100, 200, 300, 400],
}
)
df["a"] = (
df["a"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
)
df["b"] = (
df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
)
df["c"] = (
df["c"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
)
df.columns = ["a", "a", "c", "c", "e"]

corr_calc = df.corr(method=method)
Expand Down
34 changes: 22 additions & 12 deletions pandas/tests/methods/corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@
Tests for core/methods/corr.py
"""

import pytest
import numpy as np
from pandas import DataFrame, Series, Categorical
import pytest

from pandas import (
Categorical,
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols

Expand Down Expand Up @@ -75,22 +80,22 @@
# second 'dup' is non-categorical
DataFrame(
{
"dup": Series(
"dup_1": Series(
Categorical(
["low", "m", "h"],
categories=["low", "m", "h"],
ordered=True,
)
),
"dup": Series([5, 6, 7]), # duplicate name, later column
"dup_2": Series([5, 6, 7]), # duplicate name, later column
}
),
DataFrame(
{
# After transform: position 0 (ordered cat) becomes codes [0,1,2],
# position 1 remains untouched numbers [5,6,7].
"dup": Series([0, 1, 2], dtype="int8"),
"dup": Series([5, 6, 7]),
"dup_1": Series([0, 1, 2], dtype="int8"),
"dup_2": Series([5, 6, 7]),
}
),
id="duplicate-names-ordered-first",
Expand All @@ -100,15 +105,15 @@
# second 'dup' is ordered categorical, third 'dup' is ordered categorical
DataFrame(
{
"dup": Series(["a", "b", "c"]), # non-categorical (object)
"dup": Series(
"dup_1": Series(["a", "b", "c"]), # non-categorical (object)
"dup_2": Series(
Categorical(
["p", "q", None],
categories=["p", "q"],
ordered=True,
)
),
"dup": Series(
"dup_3": Series(
Categorical(
["low", "m", "h"],
categories=["low", "m", "h"],
Expand All @@ -121,16 +126,21 @@
{
# First stays object; second turns into codes [0, 1, NaN]
# and third changes into codes [0, 1, 2]
"dup": Series(["a", "b", "c"]),
"dup": Series([0.0, 1.0, np.nan]),
"dup": Series([0, 1, 2], dtype="int8"),
"dup_1": Series(["a", "b", "c"]),
"dup_2": Series([0.0, 1.0, np.nan]),
"dup_3": Series([0, 1, 2], dtype="int8"),
}
),
id="duplicate-names-ordered-and-non-categorical-and-none",
),
],
)
def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this test is necessary; your other tests are sufficient.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this function could potentially be used for things other than correlation, since it is a specific type of transformation in its own right. Correlation is just one use case for converting ordered categoricals to their codes, so it seems to me that the function should be tested for what it is supposed to do, irrespective of its use in correlation. Please let me know what you think.

# duplicate columns creation for dup columns
if "dup_1" in input_df.columns:
input_df.columns = ["dup" for _ in range(len(input_df.columns))]
expected_df.columns = ["dup" for _ in range(len(expected_df.columns))]

out_df = transform_ord_cat_cols_to_coded_cols(input_df)
assert list(out_df.columns) == list(expected_df.columns)
for i, col in enumerate(out_df.columns):
Expand Down
49 changes: 18 additions & 31 deletions pandas/tests/series/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,19 +187,19 @@ def test_corr_callable_method(self, datetime_series):

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@pytest.mark.parametrize(
"ord_cat_series",
"cat_series",
[
Series( # ordered categorical series
pd.Categorical(
["low", "med", "high", "very_high"],
categories=["low", "med", "high", "very_high"],
Series(
pd.Categorical( # ordered cat series
["low", "medium", "high"],
categories=["low", "medium", "high"],
ordered=True,
)
),
Series( # ordered categorical series with nan and a different ranking
pd.Categorical(
["h", "low", "vh", None],
categories=["low", "m", "h", "vh"],
Series(
pd.Categorical( # ordered cat series with NA
["low", "medium", "high", None],
categories=["low", "medium", "high"],
ordered=True,
)
),
Expand All @@ -208,36 +208,23 @@ def test_corr_callable_method(self, datetime_series):
@pytest.mark.parametrize(
"other_series",
[
Series( # int series against which tord cat series is correlated
[0, 1, 2, 3]
),
Series( # float series against which ord cat series is correlated
[2.0, 3.0, 4.5, 6.5]
),
Series( # other ord cat series against which ord cat series is correlated
Series( # other cat ordered series
pd.Categorical(
["high", "low", "very_high", "med"],
categories=["low", "med", "high", "very_high"],
["m", "l", "h"],
categories=["l", "m", "h"],
ordered=True,
)
),
# other non cat series
Series([2, 1, 3]),
],
)
def test_corr_rank_ordered_categorical(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is pretty long, to the point where it's unclear what its intent is. Maybe it's worth breaking up into a few tests? Or adding parameterization?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

self,
method,
ord_cat_series,
cat_series,
other_series,
):
stats = pytest.importorskip("scipy.stats")
method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
ord_ser_cat_codes = ord_cat_series.cat.codes.replace(-1, np.nan)

if other_series.dtype == "category" and other_series.cat.ordered:
other_series = other_series.cat.codes.replace(-1, np.nan)

corr_calc = ord_cat_series.corr(other_series, method=method)
corr_expected = method_scipy_func[method](
ord_ser_cat_codes, other_series, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)
expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5}
corr_calc = cat_series.corr(other_series, method=method)
tm.assert_almost_equal(corr_calc, expected_corr[method])
Loading