Skip to content
8 changes: 6 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8190,11 +8190,15 @@ def update(
if not isinstance(other, DataFrame):
other = DataFrame(other)

other = other.reindex_like(self)
# reindex rows, non-matching columns get skipped
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure we need this comment

other = other.reindex(self.index)

for col in self.columns:
shared_cols = self.columns.intersection(other.columns)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@MarcoGorelli I know this solution kinda side-steps the original issue of null-matching (and reindex introducing an entire NA-column that doesn't need updating I think), but happy to have your thoughts on this solution.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is fine

Something that comes to mind is that it'll still error if other contains a column all of nan, e.g.:

In [1]: df1 = pd.DataFrame({'a': [NaT]}) In [2]: df2 = pd.DataFrame({'a': [np.nan]}) In [3]: df1.update(df2, overwrite=False) --------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In [3], line 1 ----> 1 df1.update(df2, overwrite=False) File ~/pandas-dev/pandas/core/frame.py:8096, in DataFrame.update(self, other, join, overwrite, filter_func, errors) 8093 if mask.all(): 8094 continue -> 8096 self.loc[:, col] = expressions.where(mask, this, that) File ~/pandas-dev/pandas/core/computation/expressions.py:258, in where(cond, a, b, use_numexpr) 246 """  247 Evaluate the where condition cond on a and b.  248   (...)  255 Whether to try to use numexpr.  256 """ 257 assert _where is not None --> 258 return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) File ~/pandas-dev/pandas/core/computation/expressions.py:188, in _where_numexpr(cond, a, b) 181 result = ne.evaluate( 182 "where(cond_value, a_value, b_value)", 183 local_dict={"cond_value": cond, "a_value": a, "b_value": b}, 184 casting="safe", 185 ) 187 if result is None: --> 188 result = _where_standard(cond, a, b) 190 return result File ~/pandas-dev/pandas/core/computation/expressions.py:172, in _where_standard(cond, a, b) 170 def _where_standard(cond, a, b): 171 # Caller is responsible for extracting ndarray if necessary --> 172 return np.where(cond, a, b) File <__array_function__ internals>:180, in where(*args, **kwargs) TypeError: The DType <class 'numpy.dtype[datetime64]'> could not be promoted by <class 'numpy.dtype[float64]'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtype[datetime64]'>, <class 'numpy.dtype[float64]'>)

but arguably that's desirable behaviour - you wouldn't want to update with a column of an incompatible dtype, regardless of whether its values were all missing or not.

And value-dependent behaviour wouldn't be great, so I like this solution more than the originally-suggested "if all nan then skip"


for col in shared_cols:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's keep it on one line

 for col in self.columns.intersection(other.columns): 
this = self[col]._values
that = other[col]._values

if filter_func is not None:
with np.errstate(all="ignore"):
mask = ~filter_func(this) | isna(that)
Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/frame/methods/test_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,37 @@ def test_update_modify_view(self, using_copy_on_write):
tm.assert_frame_equal(result_view, df2_orig)
else:
tm.assert_frame_equal(result_view, expected)

def test_update_dt_column_with_NaT_create_column(self):
df = DataFrame(
{
"A": [1, None],
"B": [
pd.NaT,
pd.to_datetime("2016-01-01"),
],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we keep this on a single line?

}
)
df2 = DataFrame({"A": [2, 3]})

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's remove all these newlines in the tests

df.update(df2, overwrite=False)

expected = DataFrame(
{"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]}
)

tm.assert_frame_equal(df, expected)

def test_update_dt_column_with_NaT_create_row(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not really sure what this test adds, I'd suggest to either:

  • parametrize over the first test
  • just remove this one

df = DataFrame({"A": [1, None], "B": [pd.to_datetime("2017-1-1"), pd.NaT]})

df2 = DataFrame({"A": [2], "B": [pd.to_datetime("2016-01-01")]})

df.update(df2, overwrite=False)

expected = DataFrame(
{"A": [1, None], "B": [pd.to_datetime("2017-1-1"), pd.NaT]}
)

tm.assert_frame_equal(df, expected)