-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
ENH: Merge DataFrame and Series using on (GH21220) #21223
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 17 commits
91a3f16 64e6e66 8939fad b09ffb7 1a0e5a2 59b487d 0d14cc2 12da2ce 2d689ad 687568b 7ed9688 2ce56e7 ac9d5a1 fdf324c e896a91 69d8bda f7b333f c4c988c d099fd6 File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -11,7 +11,7 @@ | |
| import pandas.compat as compat | ||
| | ||
| from pandas import (Categorical, DataFrame, | ||
| Index, MultiIndex, Timedelta) | ||
| Index, MultiIndex, Timedelta, Series) | ||
| from pandas.core.arrays.categorical import _recode_for_categories | ||
| from pandas.core.frame import _merge_doc | ||
| from pandas.core.dtypes.common import ( | ||
| | @@ -493,6 +493,8 @@ def __init__(self, left, right, how='inner', on=None, | |
| left_index=False, right_index=False, sort=True, | ||
| suffixes=('_x', '_y'), copy=True, indicator=False, | ||
| validate=None): | ||
| left = validate_operand(left) | ||
| right = validate_operand(right) | ||
| self.left = self.orig_left = left | ||
| self.right = self.orig_right = right | ||
| self.how = how | ||
| | @@ -519,13 +521,6 @@ def __init__(self, left, right, how='inner', on=None, | |
| raise ValueError( | ||
| 'indicator option can only accept boolean or string arguments') | ||
| | ||
| if not isinstance(left, DataFrame): | ||
| raise ValueError('can not merge DataFrame with instance of ' | ||
| 'type {left}'.format(left=type(left))) | ||
| if not isinstance(right, DataFrame): | ||
| raise ValueError('can not merge DataFrame with instance of ' | ||
| 'type {right}'.format(right=type(right))) | ||
| | ||
| if not is_bool(left_index): | ||
| raise ValueError( | ||
| 'left_index parameter must be of type bool, not ' | ||
| | @@ -1645,3 +1640,16 @@ def _should_fill(lname, rname): | |
| | ||
| def _any(x): | ||
| return x is not None and com._any_not_none(*x) | ||
| | ||
| | ||
| def validate_operand(obj): | ||
| if isinstance(obj, DataFrame): | ||
| return obj | ||
| elif isinstance(obj, Series): | ||
| if obj.name is None: | ||
| raise ValueError('Cannot merge a Series without a name') | ||
| else: | ||
| return obj.to_frame() | ||
| else: | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just raise, and more like
Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks - done - updated the statement and made into a TypeError Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason you changed this to TypeError? Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche Thanks - I thought TypeError is more relevant for the statement below, as the message is telling the user "only objects of type Series or DataFrame can be merged" Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think the one is necessarily much better than the other in this case, so I would personally rather keep it as it was to be conservative. Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Although eg Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok - not changing then and keeping it (i.e. TypeError) | ||
| raise TypeError('Can only merge Series or DataFrame objects, ' | ||
| 'a {obj} was passed'.format(obj=type(obj))) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -228,16 +228,19 @@ def test_join_on_fails_with_different_column_counts(self): | |
| index=tm.makeCustomIndex(10, 2)) | ||
| merge(df, df2, right_on='a', left_on=['a', 'b']) | ||
| | ||
| def test_join_on_fails_with_wrong_object_type(self): | ||
| # GH12081 | ||
| wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] | ||
| df = DataFrame({'a': [1, 1]}) | ||
| @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])]) | ||
| def test_join_on_fails_with_wrong_object_type(self, wrong_type): | ||
| # GH12081 - original issue | ||
| | ||
| # GH21220 - merging of Series and DataFrame is now allowed | ||
| # Edited the test to remove the Series object from test parameters | ||
| # Also, parameterized the original test | ||
| ||
| | ||
| for obj in wrongly_typed: | ||
| with tm.assert_raises_regex(ValueError, str(type(obj))): | ||
| merge(obj, df, left_on='a', right_on='a') | ||
| with tm.assert_raises_regex(ValueError, str(type(obj))): | ||
| merge(df, obj, left_on='a', right_on='a') | ||
| df = DataFrame({'a': [1, 1]}) | ||
| with tm.assert_raises_regex(TypeError, str(type(wrong_type))): | ||
| merge(wrong_type, df, left_on='a', right_on='a') | ||
| with tm.assert_raises_regex(TypeError, str(type(wrong_type))): | ||
| merge(df, wrong_type, left_on='a', right_on='a') | ||
| | ||
| def test_join_on_pass_vector(self): | ||
| expected = self.target.join(self.source, on='C') | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -1887,3 +1887,33 @@ def test_merge_index_types(index): | |
| OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]), | ||
| index=index) | ||
| assert_frame_equal(result, expected) | ||
| | ||
| | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also parameterize this as much as possible. Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done | ||
| @pytest.mark.parametrize("on,left_on,right_on,left_index,right_index,nms,nm", [ | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you format this so it lines up, its pretty unreadable now Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks - done | ||
| (['outer', 'inner'], None, None, False, False, ['outer', 'inner'], 'B'), | ||
| (None, None, None, True, True, ['outer', 'inner'], 'B'), | ||
| (None, ['outer', 'inner'], None, False, True, None, 'B'), | ||
| (None, None, ['outer', 'inner'], True, False, None, 'B'), | ||
| (['outer', 'inner'], None, None, False, False, ['outer', 'inner'], None), | ||
| (None, None, None, True, True, ['outer', 'inner'], None), | ||
| (None, ['outer', 'inner'], None, False, True, None, None), | ||
| (None, None, ['outer', 'inner'], True, False, None, None)]) | ||
| def test_merge_series(on, left_on, right_on, left_index, right_index, nms, nm): | ||
| # GH 21220 | ||
| a = pd.DataFrame({"A": [1, 2, 3, 4]}, | ||
| index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]], | ||
| names=['outer', 'inner'])) | ||
| b = pd.Series([1, 2, 3, 4], | ||
| index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]], | ||
| names=['outer', 'inner']), name=nm) | ||
| expected = pd.DataFrame({"A": [2, 4], "B": [1, 3]}, | ||
| index=pd.MultiIndex.from_product([['a', 'b'], [1]], | ||
| names=nms)) | ||
| if nm is not None: | ||
| result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on, | ||
| left_index=left_index, right_index=right_index) | ||
| tm.assert_frame_equal(result, expected) | ||
| else: | ||
| with tm.assert_raises_regex(ValueError, 'a Series without a name'): | ||
| result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on, | ||
| left_index=left_index, right_index=right_index) | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As of my prior commit 69d8bda initially this only said
right : DataFrame, not sure when the part arounddictwas added. Have removed mention ofdictas this PR only ensures thatDataFrameandSeriescan be mergedThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Have not tested merge with dict - as recently changed local machine - is this needed in this PR?