pandas-dev · WillAyd · Sep 3, 2024 · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -48,6 +48,7 @@ Other enhancements
 - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
 

diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
@@ -10,6 +10,7 @@
  Iterator,
  Sequence,
 )
+from random import shuffle
 from typing import (
  TYPE_CHECKING,
  Any,
@@ -1337,6 +1338,13 @@ def _make_plot(self, fig: Figure) -> None:
  norm, cmap = self._get_norm_and_cmap(c_values, color_by_categorical)
  cb = self._get_colorbar(c_values, c_is_column)
 
+ # if a list of non color strings is passed in as c, generate a list
+ # colored by uniqueness of the strings, such same strings get same color
+ create_colors = not self._are_valid_colors(c_values)
+ if create_colors:
+ custom_color_mapping, c_values = self._uniquely_color_strs(c_values)
+ cb = False # no colorbar; opt for legend
+
  if self.legend:
  label = self.label
  else:
@@ -1367,6 +1375,15 @@ def _make_plot(self, fig: Figure) -> None:
  label, # type: ignore[arg-type]
  )
 
+ # build legend for labeling custom colors
+ if create_colors:
+ ax.legend(
+ handles=[
+ mpl.patches.Circle((0, 0), facecolor=color, label=string)
+ for string, color in custom_color_mapping.items()
+ ]
+ )
+
  errors_x = self._get_errorbars(label=x, index=0, yerr=False)
  errors_y = self._get_errorbars(label=y, index=0, xerr=False)
  if len(errors_x) > 0 or len(errors_y) > 0:
@@ -1390,6 +1407,38 @@ def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool):
  c_values = c
  return c_values
 
+ def _are_valid_colors(self, c_values: np.ndarray | list):
+ # check if c_values contains strings and if these strings are valid mpl colors.
+ # no need to check numerics as these (and mpl colors) will be validated for us
+ # in .Axes.scatter._parse_scatter_color_args(...)
+ try:
+ if len(c_values) and all(isinstance(c, str) for c in c_values):
+ mpl.colors.to_rgba_array(c_values)
+
+ return True
+
+ except (TypeError, ValueError) as _:
+ return False
+
+ def _uniquely_color_strs(
+ self, c_values: np.ndarray | list
+ ) -> tuple[dict, np.ndarray]:
+ # well, almost uniquely color them (up to 949)
+ unique = np.unique(c_values)
+
+ # for up to 7, lets keep colors consistent
+ if len(unique) <= 7:
+ possible_colors = list(mpl.colors.BASE_COLORS.values()) # Hex
+ # explore better ways to handle this case
+ else:
+ possible_colors = list(mpl.colors.XKCD_COLORS.values()) # Hex
+ shuffle(possible_colors)
+
+ colors = [possible_colors[i % len(possible_colors)] for i in range(len(unique))]
+ color_mapping = dict(zip(unique, colors))
+
+ return color_mapping, np.array(list(map(color_mapping.get, c_values)))
+
  def _get_norm_and_cmap(self, c_values, color_by_categorical: bool):
  c = self.c
  if self.colormap is not None:

diff --git a/pandas/tests/plotting/frame/test_frame_color.py b/pandas/tests/plotting/frame/test_frame_color.py
@@ -207,6 +207,21 @@ def test_scatter_with_c_column_name_with_colors(self, cmap):
  ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap)
  assert ax.collections[0].colorbar is None
 
+ def test_scatter_with_c_column_name_without_colors(self):
+ df = DataFrame(
+ {
+ "dataX": range(100),
+ "dataY": range(100),
+ "state": ["NY", "MD", "MA", "CA"] * 25,
+ }
+ )
+ df.plot.scatter("dataX", "dataY", c="state")
+
+ with tm.assert_produces_warning(None):
+ ax = df.plot.scatter(x=0, y=1, c="state")
+
+ assert len(np.unique(ax.collections[0].get_facecolor())) == 4 # 4 states
+
  def test_scatter_colors(self):
  df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
  with pytest.raises(TypeError, match="Specify exactly one of `c` and `color`"):