addressing review comments

pandas-dev · pandeconscious · Oct 23, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025
commit 259424e6ba56c60e3912c11ca16763b476f352db
diff --git a/pandas/core/methods/corr.py b/pandas/core/methods/corr.py
@@ -16,8 +16,7 @@
 
 def transform_ord_cat_cols_to_coded_cols(df: DataFrame) -> DataFrame:
  """
- any ordered categorical columns are transformed to the respective
- categorical codes while other columns remain untouched
+ Replace ordered categoricals with their codes, making a shallow copy if necessary.
  """
 
  result = df

diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -262,31 +262,21 @@ def test_corr_rank_ordered_categorical(
  ):
  df = DataFrame(
  {
- "ord_cat": Series(
- pd.Categorical(
- ["low", "m", "h", "vh"],
- categories=["low", "m", "h", "vh"],
- ordered=True,
- )
+ "ord_cat": pd.Categorical(
+ ["low", "m", "h", "vh"],
+ categories=["low", "m", "h", "vh"],
+ ordered=True,
  ),
- "ord_cat_none": Series(
- pd.Categorical(
- ["low", "m", "h", None],
- categories=["low", "m", "h"],
- ordered=True,
- )
+ "ord_cat_none": pd.Categorical(
+ ["low", "m", "h", None],
+ categories=["low", "m", "h"],
+ ordered=True,
  ),
- "ord_int": Series([0, 1, 2, 3]),
- "ord_float": Series([2.0, 3.0, 4.5, 6.5]),
- "ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
- "ord_cat_shuff": Series(
- pd.Categorical(
- ["m", "h", "vh", "low"],
- categories=["low", "m", "h", "vh"],
- ordered=True,
- )
+ "ord_cat_shuff": pd.Categorical(
+ ["m", "h", "vh", "low"],
+ categories=["low", "m", "h", "vh"],
+ ordered=True,
  ),
- "ord_int_shuff": Series([2, 3, 0, 1]),
  }
  )
  corr_calc = df.corr(method=method)
@@ -300,24 +290,16 @@ def test_corr_rank_ordered_categorical_duplicate_columns(
  self,
  method,
  ):
+ cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
  df = DataFrame(
  {
- "a": [1, 2, 3, 4],
- "b": [4, 3, 2, 1],
+ "a": pd.array([1, 2, 3, 4], dtype=cat),
+ "b": pd.array([4, 3, 2, 1], dtype=cat),
  "c": [4, 3, 2, 1],
  "d": [10, 20, 30, 40],
  "e": [100, 200, 300, 400],
  }
  )
- df["a"] = (
- df["a"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
- )
- df["b"] = (
- df["b"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
- )
- df["c"] = (
- df["c"].astype("category").cat.set_categories([4, 3, 2, 1], ordered=True)
- )
  df.columns = ["a", "a", "c", "c", "e"]
 
  corr_calc = df.corr(method=method)

diff --git a/pandas/tests/methods/corr.py b/pandas/tests/methods/corr.py
@@ -2,9 +2,14 @@
 Tests for core/methods/corr.py
 """
 
-import pytest
 import numpy as np
-from pandas import DataFrame, Series, Categorical
+import pytest
+
+from pandas import (
+ Categorical,
+ DataFrame,
+ Series,
+)
 import pandas._testing as tm
 from pandas.core.methods.corr import transform_ord_cat_cols_to_coded_cols
 
@@ -75,22 +80,22 @@
  # second 'dup' is non-categorical
  DataFrame(
  {
- "dup": Series(
+ "dup_1": Series(
  Categorical(
  ["low", "m", "h"],
  categories=["low", "m", "h"],
  ordered=True,
  )
  ),
- "dup": Series([5, 6, 7]), # duplicate name, later column
+ "dup_2": Series([5, 6, 7]), # duplicate name, later column
  }
  ),
  DataFrame(
  {
  # After transform: position 0 (ordered cat) becomes codes [0,1,2],
  # position 1 remains untouched numbers [5,6,7].
- "dup": Series([0, 1, 2], dtype="int8"),
- "dup": Series([5, 6, 7]),
+ "dup_1": Series([0, 1, 2], dtype="int8"),
+ "dup_2": Series([5, 6, 7]),
  }
  ),
  id="duplicate-names-ordered-first",
@@ -100,15 +105,15 @@
  # second 'dup' is ordered categorical, third 'dup' is ordered categorical
  DataFrame(
  {
- "dup": Series(["a", "b", "c"]), # non-categorical (object)
- "dup": Series(
+ "dup_1": Series(["a", "b", "c"]), # non-categorical (object)
+ "dup_2": Series(
  Categorical(
  ["p", "q", None],
  categories=["p", "q"],
  ordered=True,
  )
  ),
- "dup": Series(
+ "dup_3": Series(
  Categorical(
  ["low", "m", "h"],
  categories=["low", "m", "h"],
@@ -121,16 +126,21 @@
  {
  # First stays object; second turns into codes [0, 1, NaN]
  # and third changes into codes [0, 1, 2]
- "dup": Series(["a", "b", "c"]),
- "dup": Series([0.0, 1.0, np.nan]),
- "dup": Series([0, 1, 2], dtype="int8"),
+ "dup_1": Series(["a", "b", "c"]),
+ "dup_2": Series([0.0, 1.0, np.nan]),
+ "dup_3": Series([0, 1, 2], dtype="int8"),
  }
  ),
  id="duplicate-names-ordered-and-non-categorical-and-none",
  ),
  ],
 )
 def test_transform_ord_cat_cols_to_coded_cols(input_df, expected_df):
+ # dict keys must be unique, so duplicate-name cases are built with dup_N labels; relabel them all to "dup" here
+ if "dup_1" in input_df.columns:
+ input_df.columns = ["dup" for _ in range(len(input_df.columns))]
+ expected_df.columns = ["dup" for _ in range(len(expected_df.columns))]
+
  out_df = transform_ord_cat_cols_to_coded_cols(input_df)
  assert list(out_df.columns) == list(expected_df.columns)
  for i, col in enumerate(out_df.columns):

diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py
@@ -187,19 +187,19 @@ def test_corr_callable_method(self, datetime_series):
 
  @pytest.mark.parametrize("method", ["kendall", "spearman"])
  @pytest.mark.parametrize(
- "ord_cat_series",
+ "cat_series",
  [
- Series( # ordered categorical series
- pd.Categorical(
- ["low", "med", "high", "very_high"],
- categories=["low", "med", "high", "very_high"],
+ Series(
+ pd.Categorical( # ordered cat series
+ ["low", "medium", "high"],
+ categories=["low", "medium", "high"],
  ordered=True,
  )
  ),
- Series( # ordered categorical series with nan and a different ranking
- pd.Categorical(
- ["h", "low", "vh", None],
- categories=["low", "m", "h", "vh"],
+ Series(
+ pd.Categorical( # ordered cat series with NA
+ ["low", "medium", "high", None],
+ categories=["low", "medium", "high"],
  ordered=True,
  )
  ),
@@ -208,36 +208,23 @@ def test_corr_callable_method(self, datetime_series):
  @pytest.mark.parametrize(
  "other_series",
  [
- Series( # int series against which tord cat series is correlated
- [0, 1, 2, 3]
- ),
- Series( # float series against which ord cat series is correlated
- [2.0, 3.0, 4.5, 6.5]
- ),
- Series( # other ord cat series against which ord cat series is correlated
+ Series( # other cat ordered series
  pd.Categorical(
- ["high", "low", "very_high", "med"],
- categories=["low", "med", "high", "very_high"],
+ ["m", "l", "h"],
+ categories=["l", "m", "h"],
  ordered=True,
  )
  ),
+ # other series that is non-categorical
+ Series([2, 1, 3]),
  ],
  )
  def test_corr_rank_ordered_categorical(
  self,
  method,
- ord_cat_series,
+ cat_series,
  other_series,
  ):
- stats = pytest.importorskip("scipy.stats")
- method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
- ord_ser_cat_codes = ord_cat_series.cat.codes.replace(-1, np.nan)
-
- if other_series.dtype == "category" and other_series.cat.ordered:
- other_series = other_series.cat.codes.replace(-1, np.nan)
-
- corr_calc = ord_cat_series.corr(other_series, method=method)
- corr_expected = method_scipy_func[method](
- ord_ser_cat_codes, other_series, nan_policy="omit"
- )[0]
- tm.assert_almost_equal(corr_calc, corr_expected)
+ expected_corr = {"kendall": 0.33333333333333337, "spearman": 0.5}
+ corr_calc = cat_series.corr(other_series, method=method)
+ tm.assert_almost_equal(corr_calc, expected_corr[method])