Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1320ff1
Fixed #36562
ssche Oct 13, 2020
a1b9385
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 13, 2020
1f76d21
Fixed flake issue
ssche Oct 13, 2020
7076841
fixed linting errors
ssche Oct 13, 2020
225675d
Addressed review comments
ssche Oct 14, 2020
2677166
Make `sort_tuples` last resort sorting strategy to allow faster sorte…
ssche Oct 16, 2020
6f476bc
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 16, 2020
3688238
manually apply pandas black changes (from CI)
ssche Oct 16, 2020
aba429c
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 16, 2020
8ae9279
Modified changelog message as per review comments
ssche Oct 21, 2020
dd5a38d
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 21, 2020
7b7d6f8
Removed pd.xyz
ssche Oct 22, 2020
4226662
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 22, 2020
8cbfa01
forgot `pd`
ssche Oct 22, 2020
6d71000
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 26, 2020
92e1e33
Changed sorting algorithm by using expanded array
ssche Oct 31, 2020
3ada9ce
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 31, 2020
95670f1
Add ticket ref to test case
ssche Oct 31, 2020
d9dcd22
sort import
ssche Oct 31, 2020
66c30c7
Address review comments
ssche Oct 31, 2020
db66528
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 31, 2020
16ae4f4
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Nov 3, 2020
9d03dc3
Fixed: committed file was in merge state
ssche Nov 3, 2020
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Addressed review comments
  • Loading branch information
ssche committed Oct 14, 2020
commit 225675db4433f523699a69042175c55bfd07c99d
32 changes: 15 additions & 17 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2061,32 +2061,32 @@ def sort_tuples(values):
def cmp_func(index_x, index_y):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could do cmp_func(left, right)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes let's change the name and please add typing for cmp_func and sort_tuples. (sort_mixed if you can as well :->)

x = values[index_x]
y = values[index_y]
# shortcut loop in case both tuples are the same
Copy link
Contributor

@jreback jreback Oct 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you don't need to do any of this

see safe_sort

Copy link
Contributor Author

@ssche ssche Oct 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what are you saying? what isn't needed?

this PR is changing safe_sort to accommodate mixed-type tuples

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and how does this not do it now!

iirc this is adding a lot of non performing code for the purpose of fixing the error message?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and how does this not do it now!

now a TypeError will be raised in df.combine_first when the df contains a MultiIndex with nan and string values (sort_mixed, which is used now, fails in that case).

iirc this is adding a lot of non performing code for the purpose of fixing the error message?

in the event of all other (more performant) sorters failing, this (unarguably slower) sorter will be used. this should not compromise any other use case's performance, but at least makes the code in the ticket description succeed (slower, but at least not failing).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I moved the slow sort_tuples method so it really isn't called when any of the other (faster) sorters can work (2677166#diff-c8f3ad29eaf121537b999e88e9117f3e3702d0b818a67516da25093fe2890ce8R2114). please have another look and provide feedback.

if x == y:
return 0
len_x = len(x)
len_y = len(y)
for i in range(max(len_x, len_y)):
# lexicographic sorting
for i in range(max(len(x), len(y))):
# check if the tuples have different lengths (shorter tuples
# first)
if i >= len_x:
if i >= len(x):
return -1
if i >= len_y:
if i >= len(y):
return +1
x_i_na = isna(x[i])
y_i_na = isna(y[i])
x_is_na = isna(x[i])
y_is_na = isna(y[i])
# values are the same -> resolve tie with next element
if (x_i_na and y_i_na) or (x[i] == y[i]):
if (x_is_na and y_is_na) or (x[i] == y[i]):
continue
# check for nan values (sort nan to the end which is consistent
# with numpy
if x_i_na and not y_i_na:
# check for nan values (sort nan to the end)
if x_is_na and not y_is_na:
return +1
if not x_i_na and y_i_na:
if not x_is_na and y_is_na:
return -1
# normal greater/less than comparison
if x[i] < y[i]:
return -1
return +1
# both values are the same (should already have been caught)
return 0

ixs = np.arange(len(values))
Expand All @@ -2095,12 +2095,10 @@ def cmp_func(index_x, index_y):

sorter = None

ext_arr = is_extension_array_dtype(values)
if not ext_arr and lib.infer_dtype(values, skipna=False) == "mixed-integer":
# unorderable in py3 if mixed str/int
is_ea = is_extension_array_dtype(values)
if not is_ea and lib.infer_dtype(values, skipna=False) == "mixed-integer":
ordered = sort_mixed(values)
elif not ext_arr and values.size and isinstance(values[0], tuple):
# 1-D arrays with tuples of potentially mixed type (solves GH36562)
elif not is_ea and values.size and isinstance(values[0], tuple):
ordered = sort_tuples(values)
else:
try:
Expand Down
16 changes: 9 additions & 7 deletions pandas/tests/indexing/multiindex/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_multiindex_get_loc_list_raises(self):
idx.get_loc([])


def test_combine_first_with_nan_index():
def test_combine_first_with_nan_multiindex():
mi1 = pd.MultiIndex.from_arrays(
[["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
)
Expand All @@ -102,17 +102,19 @@ def test_combine_first_with_nan_index():
[["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
)
s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
df_combined = df.combine_first(pd.DataFrame({"col": s}))
res = df.combine_first(pd.DataFrame({"d": s}))
mi_expected = pd.MultiIndex.from_arrays(
[
["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
[1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
],
names=["a", "b"],
)
assert (df_combined.index == mi_expected).all()
exp_col = np.asarray(
[1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan]
expected = pd.DataFrame(
{
"c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
"d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
},
index=mi_expected,
)
act_col = df_combined["col"].values
assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True)
tm.assert_frame_equal(res, expected)