Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,7 @@ Performance improvements
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
- Performance improvement in :func:`factorize` (:issue:`46109`)
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This note needs to be moved to the 2.0 whatsnew.

- Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`)
- Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`)
- Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`)
Expand Down
125 changes: 103 additions & 22 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1828,6 +1828,28 @@ def to_numpy(

return result

def _create_data_for_split_and_tight_to_dict(
    self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
) -> list:
    """
    Build the row-wise ``"data"`` entry shared by ``to_dict(orient="split")``
    and ``to_dict(orient="tight")``.

    Parameters
    ----------
    are_all_object_dtype_cols : bool
        If True, every value of every row is passed through
        ``maybe_box_native`` while the rows are being built.
    object_dtype_indices : list[int]
        Positions, within a row, of the values that must be passed through
        ``maybe_box_native``. Only consulted when
        ``are_all_object_dtype_cols`` is False.

    Returns
    -------
    list
        One list per row, in the order produced by
        ``self.itertuples(index=False, name=None)``.
    """
    rows = self.itertuples(index=False, name=None)

    if are_all_object_dtype_cols:
        # Every value is converted, so do it while each row is built.
        return [[maybe_box_native(value) for value in row] for row in rows]

    records = list(map(list, rows))
    if object_dtype_indices:
        # For performance, build plain lists first and then apply
        # maybe_box_native in place only at the object-dtype positions.
        for record in records:
            for position in object_dtype_indices:
                record[position] = maybe_box_native(record[position])
    return records

@overload
def to_dict(
self,
Expand Down Expand Up @@ -1967,30 +1989,50 @@ def to_dict(
"'index=False' is only valid when 'orient' is 'split' or 'tight'"
)

if orient == "series":
# GH46470 Return quickly if orient series to avoid creating dtype objects
return into_c((k, v) for k, v in self.items())

object_dtype_indices = [
i
for i, col_dtype in enumerate(self.dtypes.values)
if is_object_dtype(col_dtype)
]
are_all_object_dtype_cols = len(object_dtype_indices) == len(self.dtypes)

if orient == "dict":
return into_c((k, v.to_dict(into)) for k, v in self.items())

elif orient == "list":
object_dtype_indices_as_set = set(object_dtype_indices)
return into_c(
(k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
(
k,
list(map(maybe_box_native, v.tolist()))
if i in object_dtype_indices_as_set
else v.tolist(),
)
for i, (k, v) in enumerate(self.items())
)

elif orient == "split":
data = self._create_data_for_split_and_tight_to_dict(
are_all_object_dtype_cols, object_dtype_indices
)

return into_c(
((("index", self.index.tolist()),) if index else ())
+ (
("columns", self.columns.tolist()),
(
"data",
[
list(map(maybe_box_native, t))
for t in self.itertuples(index=False, name=None)
],
),
("data", data),
)
)

elif orient == "tight":
data = self._create_data_for_split_and_tight_to_dict(
are_all_object_dtype_cols, object_dtype_indices
)

return into_c(
((("index", self.index.tolist()),) if index else ())
+ (
Expand All @@ -2007,26 +2049,65 @@ def to_dict(
+ (("column_names", list(self.columns.names)),)
)

elif orient == "series":
return into_c((k, v) for k, v in self.items())

elif orient == "records":
columns = self.columns.tolist()
rows = (
dict(zip(columns, row))
for row in self.itertuples(index=False, name=None)
)
return [
into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
]
if are_all_object_dtype_cols:
rows = (
dict(zip(columns, row))
for row in self.itertuples(index=False, name=None)
)
return [
into_c((k, maybe_box_native(v)) for k, v in row.items())
for row in rows
]
else:
data = [
into_c(zip(columns, t))
for t in self.itertuples(index=False, name=None)
]
if object_dtype_indices:
object_dtype_indices_as_set = set(object_dtype_indices)
object_dtype_cols = {
col
for i, col in enumerate(self.columns)
if i in object_dtype_indices_as_set
}
for row in data:
for col in object_dtype_cols:
row[col] = maybe_box_native(row[col])
return data

elif orient == "index":
if not self.index.is_unique:
raise ValueError("DataFrame index must be unique for orient='index'.")
return into_c(
(t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
for t in self.itertuples(name=None)
)
columns = self.columns.tolist()
if are_all_object_dtype_cols:
return into_c(
(t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
for t in self.itertuples(name=None)
)
elif object_dtype_indices:
object_dtype_indices_as_set = set(object_dtype_indices)
is_object_dtype_by_index = [
i in object_dtype_indices_as_set for i in range(len(self.columns))
]
return into_c(
(
t[0],
{
columns[i]: maybe_box_native(v)
if is_object_dtype_by_index[i]
else v
for i, v in enumerate(t[1:])
},
)
for t in self.itertuples(name=None)
)
else:
return into_c(
(t[0], dict(zip(self.columns, t[1:])))
for t in self.itertuples(name=None)
)

else:
raise ValueError(f"orient '{orient}' not understood")
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1828,7 +1828,13 @@ def to_dict(self, into: type[dict] = dict) -> dict:
"""
# GH16122
into_c = com.standardize_mapping(into)
return into_c((k, maybe_box_native(v)) for k, v in self.items())

if is_object_dtype(self):
return into_c((k, maybe_box_native(v)) for k, v in self.items())
else:
# Not an object dtype => all types will be the same so let the default
# indexer return native python type
return into_c((k, v) for k, v in self.items())

def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
"""
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/frame/methods/test_to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,16 @@ def test_to_dict_orient_tight(self, index, columns):
"b": [float, float, float],
},
),
( # Make sure we have one df which is all object type cols
{
"a": [1, "hello", 3],
"b": [1.1, "world", 3.3],
},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this hit all of the newly added code?

{
"a": [int, str, int],
"b": [float, str, float],
},
),
),
)
def test_to_dict_returns_native_types(self, orient, data, expected_types):
Expand Down