-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
REF: collect ops dispatch functions in one place, try to de-duplicate SparseDataFrame methods #23060
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
REF: collect ops dispatch functions in one place, try to de-duplicate SparseDataFrame methods #23060
Changes from 1 commit
c01c19a f0e0a4e 30f3737 5f9d111 f236663 1a556bc bcb1c35 1c9b86b a2d1a56 27c40cb 9835825 9737aee 945beb2 ecaac45 1d08646 11219fe c431373 File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,6 @@ | ||
| # -*- coding: utf-8 -*- | ||
| from collections import deque | ||
| from datetime import datetime | ||
| import operator | ||
| | ||
| import pytest | ||
| | @@ -16,28 +18,86 @@ | |
| # Comparisons | ||
| | ||
| class TestFrameComparisons(object): | ||
| def test_flex_comparison_nat(self): | ||
| # GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT, | ||
| # and _definitely_ not be NaN | ||
| df = pd.DataFrame([pd.NaT]) | ||
| | ||
| result = df == pd.NaT | ||
| # result.iloc[0, 0] is a np.bool_ object | ||
| assert result.iloc[0, 0].item() is False | ||
| | ||
| result = df.eq(pd.NaT) | ||
| assert result.iloc[0, 0].item() is False | ||
| | ||
| result = df != pd.NaT | ||
| assert result.iloc[0, 0].item() is True | ||
| | ||
| result = df.ne(pd.NaT) | ||
| assert result.iloc[0, 0].item() is True | ||
| # Specifically _not_ flex-comparisons | ||
| | ||
| def test_comparison_invalid(self): | ||
| | ||
| def check(df, df2): | ||
| | ||
| for (x, y) in [(df, df2), (df2, df)]: | ||
| # we expect the result to match Series comparisons for | ||
| # == and !=, inequalities should raise | ||
| result = x == y | ||
| expected = pd.DataFrame({col: x[col] == y[col] | ||
| for col in x.columns}, | ||
| index=x.index, columns=x.columns) | ||
| tm.assert_frame_equal(result, expected) | ||
| | ||
| result = x != y | ||
| expected = pd.DataFrame({col: x[col] != y[col] | ||
| for col in x.columns}, | ||
| index=x.index, columns=x.columns) | ||
| tm.assert_frame_equal(result, expected) | ||
| | ||
| with pytest.raises(TypeError): | ||
| x >= y | ||
| with pytest.raises(TypeError): | ||
| x > y | ||
| with pytest.raises(TypeError): | ||
| x < y | ||
| with pytest.raises(TypeError): | ||
| x <= y | ||
| | ||
| # GH4968 | ||
| # invalid date/int comparisons | ||
| df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=['a']) | ||
| df['dates'] = pd.date_range('20010101', periods=len(df)) | ||
| | ||
| df2 = df.copy() | ||
| df2['dates'] = df['a'] | ||
| check(df, df2) | ||
| | ||
| df = pd.DataFrame(np.random.randint(10, size=(10, 2)), | ||
| columns=['a', 'b']) | ||
| df2 = pd.DataFrame({'a': pd.date_range('20010101', periods=len(df)), | ||
| 'b': pd.date_range('20100101', periods=len(df))}) | ||
| check(df, df2) | ||
| | ||
| def test_timestamp_compare(self): | ||
| # make sure we can compare Timestamps on the right AND left hand side | ||
| # GH#4982 | ||
| df = pd. DataFrame({'dates1': pd.date_range('20010101', periods=10), | ||
| 'dates2': pd.date_range('20010102', periods=10), | ||
| 'intcol': np.random.randint(1000000000, size=10), | ||
| 'floatcol': np.random.randn(10), | ||
| 'stringcol': list(tm.rands(10))}) | ||
| df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT | ||
| ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq', | ||
| Contributor: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should parametrize this if you can. Member Author: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, the point of collecting these arithmetic tests is to parametrize/fixturize and especially de-duplicate them in an upcoming pass. | ||
| 'ne': 'ne'} | ||
| | ||
| for left, right in ops.items(): | ||
| left_f = getattr(operator, left) | ||
| right_f = getattr(operator, right) | ||
| | ||
| # no nats | ||
| if left in ['eq', 'ne']: | ||
| expected = left_f(df, pd.Timestamp('20010109')) | ||
| result = right_f(pd.Timestamp('20010109'), df) | ||
| tm.assert_frame_equal(result, expected) | ||
| else: | ||
| with pytest.raises(TypeError): | ||
| left_f(df, pd.Timestamp('20010109')) | ||
| with pytest.raises(TypeError): | ||
| right_f(pd.Timestamp('20010109'), df) | ||
| # nats | ||
| expected = left_f(df, pd.Timestamp('nat')) | ||
| result = right_f(pd.Timestamp('nat'), df) | ||
| tm.assert_frame_equal(result, expected) | ||
| | ||
| def test_mixed_comparison(self): | ||
| # GH 13128, GH 22163 != datetime64 vs non-dt64 should be False, | ||
| # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, | ||
| # not raise TypeError | ||
| # (this appears to be fixed before #22163, not sure when) | ||
| # (this appears to be fixed before GH#22163, not sure when) | ||
| df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]]) | ||
| other = pd.DataFrame([['a', 'b'], ['c', 'd']]) | ||
| | ||
| | @@ -47,17 +107,6 @@ def test_mixed_comparison(self): | |
| result = df != other | ||
| assert result.all().all() | ||
| | ||
| def test_df_boolean_comparison_error(self): | ||
| # GH 4576 | ||
| # boolean comparisons with a tuple/list give unexpected results | ||
| df = pd.DataFrame(np.arange(6).reshape((3, 2))) | ||
| | ||
| # not shape compatible | ||
| with pytest.raises(ValueError): | ||
| df == (2, 2) | ||
| with pytest.raises(ValueError): | ||
| df == [2, 2] | ||
| | ||
| def test_df_float_none_comparison(self): | ||
| df = pd.DataFrame(np.random.randn(8, 3), index=range(8), | ||
| columns=['A', 'B', 'C']) | ||
| | @@ -75,6 +124,148 @@ def test_df_string_comparison(self): | |
| tm.assert_frame_equal(df[mask_b], df.loc[0:0, :]) | ||
| tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :]) | ||
| | ||
| def test_df_boolean_comparison_error(self): | ||
| # GH#4576 | ||
| # boolean comparisons with a tuple/list give unexpected results | ||
| df = pd.DataFrame(np.arange(6).reshape((3, 2))) | ||
| | ||
| # not shape compatible | ||
| with pytest.raises(ValueError): | ||
| df == (2, 2) | ||
| with pytest.raises(ValueError): | ||
| df == [2, 2] | ||
| | ||
| | ||
| class TestFrameFlexComparisons(object): | ||
| # TODO: test_bool_flex_frame needs a better name | ||
| def test_bool_flex_frame(self): | ||
| data = np.random.randn(5, 3) | ||
| other_data = np.random.randn(5, 3) | ||
| df = pd.DataFrame(data) | ||
| other = pd.DataFrame(other_data) | ||
| ndim_5 = np.ones(df.shape + (1, 3)) | ||
| | ||
| # Unaligned | ||
| def _check_unaligned_frame(meth, op, df, other): | ||
| part_o = other.loc[3:, 1:].copy() | ||
| rs = meth(part_o) | ||
| xp = op(df, part_o.reindex(index=df.index, columns=df.columns)) | ||
| tm.assert_frame_equal(rs, xp) | ||
| | ||
| # DataFrame | ||
| assert df.eq(df).values.all() | ||
| assert not df.ne(df).values.any() | ||
| for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: | ||
| Contributor: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Needs parametrization! | ||
| f = getattr(df, op) | ||
| o = getattr(operator, op) | ||
| # No NAs | ||
| tm.assert_frame_equal(f(other), o(df, other)) | ||
| _check_unaligned_frame(f, o, df, other) | ||
| # ndarray | ||
| tm.assert_frame_equal(f(other.values), o(df, other.values)) | ||
| # scalar | ||
| tm.assert_frame_equal(f(0), o(df, 0)) | ||
| # NAs | ||
| msg = "Unable to coerce to Series/DataFrame" | ||
| tm.assert_frame_equal(f(np.nan), o(df, np.nan)) | ||
| with tm.assert_raises_regex(ValueError, msg): | ||
| f(ndim_5) | ||
| | ||
| # Series | ||
| Contributor: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pull this out into a separate, parametrized test (a future PR is OK for these, though since you are moving things around, it may be better to do it here). | ||
| def _test_seq(df, idx_ser, col_ser): | ||
| idx_eq = df.eq(idx_ser, axis=0) | ||
| col_eq = df.eq(col_ser) | ||
| idx_ne = df.ne(idx_ser, axis=0) | ||
| col_ne = df.ne(col_ser) | ||
| tm.assert_frame_equal(col_eq, df == pd.Series(col_ser)) | ||
| tm.assert_frame_equal(col_eq, -col_ne) | ||
| tm.assert_frame_equal(idx_eq, -idx_ne) | ||
| tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T) | ||
| tm.assert_frame_equal(col_eq, df.eq(list(col_ser))) | ||
| tm.assert_frame_equal(idx_eq, df.eq(pd.Series(idx_ser), axis=0)) | ||
| tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0)) | ||
| | ||
| idx_gt = df.gt(idx_ser, axis=0) | ||
| col_gt = df.gt(col_ser) | ||
| idx_le = df.le(idx_ser, axis=0) | ||
| col_le = df.le(col_ser) | ||
| | ||
| tm.assert_frame_equal(col_gt, df > pd.Series(col_ser)) | ||
| tm.assert_frame_equal(col_gt, -col_le) | ||
| tm.assert_frame_equal(idx_gt, -idx_le) | ||
| tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T) | ||
| | ||
| idx_ge = df.ge(idx_ser, axis=0) | ||
| col_ge = df.ge(col_ser) | ||
| idx_lt = df.lt(idx_ser, axis=0) | ||
| col_lt = df.lt(col_ser) | ||
| tm.assert_frame_equal(col_ge, df >= pd.Series(col_ser)) | ||
| tm.assert_frame_equal(col_ge, -col_lt) | ||
| tm.assert_frame_equal(idx_ge, -idx_lt) | ||
| tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T) | ||
| | ||
| idx_ser = pd.Series(np.random.randn(5)) | ||
| col_ser = pd.Series(np.random.randn(3)) | ||
| _test_seq(df, idx_ser, col_ser) | ||
| | ||
| # list/tuple | ||
| _test_seq(df, idx_ser.values, col_ser.values) | ||
| | ||
| # NA | ||
| df.loc[0, 0] = np.nan | ||
| rs = df.eq(df) | ||
| assert not rs.loc[0, 0] | ||
| rs = df.ne(df) | ||
| assert rs.loc[0, 0] | ||
| rs = df.gt(df) | ||
| assert not rs.loc[0, 0] | ||
| rs = df.lt(df) | ||
| assert not rs.loc[0, 0] | ||
| rs = df.ge(df) | ||
| assert not rs.loc[0, 0] | ||
| rs = df.le(df) | ||
| assert not rs.loc[0, 0] | ||
| | ||
| # complex | ||
| arr = np.array([np.nan, 1, 6, np.nan]) | ||
| arr2 = np.array([2j, np.nan, 7, None]) | ||
| df = pd.DataFrame({'a': arr}) | ||
| df2 = pd.DataFrame({'a': arr2}) | ||
| rs = df.gt(df2) | ||
| assert not rs.values.any() | ||
| rs = df.ne(df2) | ||
| assert rs.values.all() | ||
| | ||
| arr3 = np.array([2j, np.nan, None]) | ||
| df3 = pd.DataFrame({'a': arr3}) | ||
| rs = df3.gt(2j) | ||
| assert not rs.values.any() | ||
| | ||
| # corner, dtype=object | ||
| df1 = pd.DataFrame({'col': ['foo', np.nan, 'bar']}) | ||
| df2 = pd.DataFrame({'col': ['foo', datetime.now(), 'bar']}) | ||
| result = df1.ne(df2) | ||
| exp = pd.DataFrame({'col': [False, True, False]}) | ||
| tm.assert_frame_equal(result, exp) | ||
| | ||
| def test_flex_comparison_nat(self): | ||
| # GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT, | ||
| # and _definitely_ not be NaN | ||
| df = pd.DataFrame([pd.NaT]) | ||
| | ||
| result = df == pd.NaT | ||
| # result.iloc[0, 0] is a np.bool_ object | ||
| assert result.iloc[0, 0].item() is False | ||
| | ||
| result = df.eq(pd.NaT) | ||
| assert result.iloc[0, 0].item() is False | ||
| | ||
| result = df != pd.NaT | ||
| assert result.iloc[0, 0].item() is True | ||
| | ||
| result = df.ne(pd.NaT) | ||
| assert result.iloc[0, 0].item() is True | ||
| | ||
| @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) | ||
| def test_df_flex_cmp_constant_return_types(self, opname): | ||
| # GH 15077, non-empty DataFrame | ||
| | @@ -375,3 +566,82 @@ def test_td64_df_add_int_frame(self): | |
| df - other | ||
| with pytest.raises(TypeError): | ||
| other - df | ||
| | ||
| def test_arith_mixed(self): | ||
| | ||
| left = pd.DataFrame({'A': ['a', 'b', 'c'], | ||
| 'B': [1, 2, 3]}) | ||
| | ||
| result = left + left | ||
| expected = pd.DataFrame({'A': ['aa', 'bb', 'cc'], | ||
| 'B': [2, 4, 6]}) | ||
| tm.assert_frame_equal(result, expected) | ||
| | ||
| def test_arith_getitem_commute(self): | ||
| df = pd.DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) | ||
| | ||
| def _test_op(df, op): | ||
| result = op(df, 1) | ||
| | ||
| if not df.columns.is_unique: | ||
| raise ValueError("Only unique columns supported by this test") | ||
| | ||
| for col in result.columns: | ||
| tm.assert_series_equal(result[col], op(df[col], 1)) | ||
| | ||
| _test_op(df, operator.add) | ||
| _test_op(df, operator.sub) | ||
| _test_op(df, operator.mul) | ||
| _test_op(df, operator.truediv) | ||
| _test_op(df, operator.floordiv) | ||
| _test_op(df, operator.pow) | ||
| | ||
| _test_op(df, lambda x, y: y + x) | ||
| _test_op(df, lambda x, y: y - x) | ||
| _test_op(df, lambda x, y: y * x) | ||
| _test_op(df, lambda x, y: y / x) | ||
| _test_op(df, lambda x, y: y ** x) | ||
| | ||
| _test_op(df, lambda x, y: x + y) | ||
| _test_op(df, lambda x, y: x - y) | ||
| _test_op(df, lambda x, y: x * y) | ||
| _test_op(df, lambda x, y: x / y) | ||
| _test_op(df, lambda x, y: x ** y) | ||
| | ||
| @pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]), | ||
| range(1, 3), deque([1, 2])]) | ||
| def test_arith_alignment_non_pandas_object(self, values): | ||
| # GH#17901 | ||
| df = pd.DataFrame({'A': [1, 1], 'B': [1, 1]}) | ||
| expected = pd.DataFrame({'A': [2, 2], 'B': [3, 3]}) | ||
| result = df + values | ||
| tm.assert_frame_equal(result, expected) | ||
| | ||
| def test_arith_non_pandas_object(self): | ||
| df = pd.DataFrame(np.arange(1, 10, dtype='f8').reshape(3, 3), | ||
| columns=['one', 'two', 'three'], | ||
| index=['a', 'b', 'c']) | ||
| | ||
| val1 = df.xs('a').values | ||
| added = pd.DataFrame(df.values + val1, | ||
| index=df.index, columns=df.columns) | ||
| tm.assert_frame_equal(df + val1, added) | ||
| | ||
| added = pd.DataFrame((df.values.T + val1).T, | ||
| index=df.index, columns=df.columns) | ||
| tm.assert_frame_equal(df.add(val1, axis=0), added) | ||
| | ||
| val2 = list(df['two']) | ||
| | ||
| added = pd.DataFrame(df.values + val2, | ||
| index=df.index, columns=df.columns) | ||
| tm.assert_frame_equal(df + val2, added) | ||
| | ||
| added = pd.DataFrame((df.values.T + val2).T, index=df.index, | ||
| columns=df.columns) | ||
| tm.assert_frame_equal(df.add(val2, axis='index'), added) | ||
| | ||
| val3 = np.random.rand(*df.shape) | ||
| added = pd.DataFrame(df.values + val3, | ||
| index=df.index, columns=df.columns) | ||
| tm.assert_frame_equal(df.add(val3), added) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you parametrize this? (Next pass is OK.)