pandas-dev · rhshadrach · Nov 22, 2022 · Apr 22, 2022 · Apr 22, 2022 · Apr 22, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -960,6 +960,7 @@ Performance improvements
 - Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
 - Performance improvement in :func:`factorize` (:issue:`46109`)
 - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
+- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
 - Performance improvement in :func:`read_excel` when ``nrows`` argument provided (:issue:`32727`)
 - Performance improvement in :meth:`.Styler.to_excel` when applying repeated CSS formats (:issue:`47371`)
 - Performance improvement in :meth:`MultiIndex.is_monotonic_increasing` (:issue:`47458`)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1828,6 +1828,28 @@ def to_numpy(
 
  return result
 
+ def _create_data_for_split_and_tight_to_dict(
+ self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
+ ) -> list:
+ """
+ Simple helper method to create the main output data for
+ ``to_dict(orient="split")`` and ``to_dict(orient="tight")``
+ """
+ if are_all_object_dtype_cols:
+ data = [
+ list(map(maybe_box_native, t))
+ for t in self.itertuples(index=False, name=None)
+ ]
+ else:
+ data = [list(t) for t in self.itertuples(index=False, name=None)]
+ if object_dtype_indices:
+ # If we have object_dtype_cols, apply maybe_box_native after list
+ # comprehension for perf
+ for row in data:
+ for i in object_dtype_indices:
+ row[i] = maybe_box_native(row[i])
+ return data
+
  @overload
  def to_dict(
  self,
@@ -1967,30 +1989,50 @@ def to_dict(
  "'index=False' is only valid when 'orient' is 'split' or 'tight'"
  )
 
+ if orient == "series":
+ # GH46470 Return early for orient="series" to skip the per-column dtype checks below
+ return into_c((k, v) for k, v in self.items())
+
+ object_dtype_indices = [
+ i
+ for i, col_dtype in enumerate(self.dtypes.values)
+ if is_object_dtype(col_dtype)
+ ]
+ are_all_object_dtype_cols = len(object_dtype_indices) == len(self.dtypes)
+
  if orient == "dict":
  return into_c((k, v.to_dict(into)) for k, v in self.items())
 
  elif orient == "list":
+ object_dtype_indices_as_set = set(object_dtype_indices)
  return into_c(
- (k, list(map(maybe_box_native, v.tolist()))) for k, v in self.items()
+ (
+ k,
+ list(map(maybe_box_native, v.tolist()))
+ if i in object_dtype_indices_as_set
+ else v.tolist(),
+ )
+ for i, (k, v) in enumerate(self.items())
  )
 
  elif orient == "split":
+ data = self._create_data_for_split_and_tight_to_dict(
+ are_all_object_dtype_cols, object_dtype_indices
+ )
+
  return into_c(
  ((("index", self.index.tolist()),) if index else ())
  + (
  ("columns", self.columns.tolist()),
- (
- "data",
- [
- list(map(maybe_box_native, t))
- for t in self.itertuples(index=False, name=None)
- ],
- ),
+ ("data", data),
  )
  )
 
  elif orient == "tight":
+ data = self._create_data_for_split_and_tight_to_dict(
+ are_all_object_dtype_cols, object_dtype_indices
+ )
+
  return into_c(
  ((("index", self.index.tolist()),) if index else ())
  + (
@@ -2007,26 +2049,65 @@ def to_dict(
  + (("column_names", list(self.columns.names)),)
  )
 
- elif orient == "series":
- return into_c((k, v) for k, v in self.items())
-
  elif orient == "records":
  columns = self.columns.tolist()
- rows = (
- dict(zip(columns, row))
- for row in self.itertuples(index=False, name=None)
- )
- return [
- into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
- ]
+ if are_all_object_dtype_cols:
+ rows = (
+ dict(zip(columns, row))
+ for row in self.itertuples(index=False, name=None)
+ )
+ return [
+ into_c((k, maybe_box_native(v)) for k, v in row.items())
+ for row in rows
+ ]
+ else:
+ data = [
+ into_c(zip(columns, t))
+ for t in self.itertuples(index=False, name=None)
+ ]
+ if object_dtype_indices:
+ object_dtype_indices_as_set = set(object_dtype_indices)
+ object_dtype_cols = {
+ col
+ for i, col in enumerate(self.columns)
+ if i in object_dtype_indices_as_set
+ }
+ for row in data:
+ for col in object_dtype_cols:
+ row[col] = maybe_box_native(row[col])
+ return data
 
  elif orient == "index":
  if not self.index.is_unique:
  raise ValueError("DataFrame index must be unique for orient='index'.")
- return into_c(
- (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
- for t in self.itertuples(name=None)
- )
+ columns = self.columns.tolist()
+ if are_all_object_dtype_cols:
+ return into_c(
+ (t[0], dict(zip(self.columns, map(maybe_box_native, t[1:]))))
+ for t in self.itertuples(name=None)
+ )
+ elif object_dtype_indices:
+ object_dtype_indices_as_set = set(object_dtype_indices)
+ is_object_dtype_by_index = [
+ i in object_dtype_indices_as_set for i in range(len(self.columns))
+ ]
+ return into_c(
+ (
+ t[0],
+ {
+ columns[i]: maybe_box_native(v)
+ if is_object_dtype_by_index[i]
+ else v
+ for i, v in enumerate(t[1:])
+ },
+ )
+ for t in self.itertuples(name=None)
+ )
+ else:
+ return into_c(
+ (t[0], dict(zip(self.columns, t[1:])))
+ for t in self.itertuples(name=None)
+ )
 
  else:
  raise ValueError(f"orient '{orient}' not understood")

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1828,7 +1828,13 @@ def to_dict(self, into: type[dict] = dict) -> dict:
  """
  # GH16122
  into_c = com.standardize_mapping(into)
- return into_c((k, maybe_box_native(v)) for k, v in self.items())
+
+ if is_object_dtype(self):
+ return into_c((k, maybe_box_native(v)) for k, v in self.items())
+ else:
+ # Not an object dtype => all types will be the same so let the default
+ # indexer return native python type
+ return into_c((k, v) for k, v in self.items())
 
  def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
  """

diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py
@@ -379,6 +379,16 @@ def test_to_dict_orient_tight(self, index, columns):
  "b": [float, float, float],
  },
  ),
+ ( # Make sure we have one df which is all object type cols
+ {
+ "a": [1, "hello", 3],
+ "b": [1.1, "world", 3.3],
+ },
+ {
+ "a": [int, str, int],
+ "b": [float, str, float],
+ },
+ ),
  ),
  )
  def test_to_dict_returns_native_types(self, orient, data, expected_types):