1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -458,6 +458,7 @@ Other
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
- Bug in :meth:`Series.unique` returning incorrect values for unique, non-UTF-8-encodable strings (:issue:`45929`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)

.. ***DO NOT USE THIS SECTION***
8 changes: 7 additions & 1 deletion pandas/_libs/hashtable_class_helper.pxi.in
@@ -1179,6 +1179,8 @@ cdef class StringHashTable(HashTable):
use_na_value = na_value is not None

# assign pointers and pre-filter out missing (if ignore_na)
# https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context
keep_bad_unicode_refs = []
Member:

This doesn't fix the underlying issue. It may make the failure less likely, but the lifecycle of the temporary strings is still not properly controlled.

I think what needs to be done is something akin to:

'1 \udcd6a NY'.encode('utf8', errors="surrogatepass").decode('utf8', errors="surrogatepass")

I think you can just do that directly in Cython, but we can look at the underlying C API if needed.

(side note: I would really love to get rid of get_c_string; it is not a value-added layer of indirection)

Member Author:

I was able to replace get_c_string with the associated C APIs, but if I understand https://cython.readthedocs.io/en/latest/src/userguide/language_basics.html#caveats-when-using-a-python-string-in-a-c-context correctly, since we're storing char pointers obtained from these encoded Python strings and using them later, we need to keep a reference to those Python strings.
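
For illustration, a minimal Cython sketch (hypothetical names, not the pandas code) of that caveat, assuming only PyUnicode_AsUTF8 from the CPython C API: the returned char* is a borrowed pointer into the str object's cached UTF-8 buffer, so it dangles once the str is garbage collected.

    from libc.stdlib cimport malloc, free

    cdef extern from "Python.h":
        const char* PyUnicode_AsUTF8(object unicode) except NULL

    def first_bytes(list strings):
        # Store borrowed char* pointers now and read them after the loop,
        # the same shape as the pandas hashtable code above.
        cdef Py_ssize_t i, n = len(strings)
        cdef const char** vecs = <const char**>malloc(n * sizeof(char*))
        if vecs is NULL:
            raise MemoryError()
        keep_alive = []                      # keeps each temporary str alive
        for i in range(n):
            tmp = repr(strings[i])           # temporary, UTF-8-encodable str
            keep_alive.append(tmp)           # without this, vecs[i] can dangle
            vecs[i] = PyUnicode_AsUTF8(tmp)  # borrowed pointer into tmp
        out = []
        for i in range(n):
            out.append(vecs[i][0])           # pointers dereferenced after the loop
        free(vecs)
        return out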

Member (@WillAyd, Apr 11, 2024):

Oh OK, I see the difference: CPython can cache the UTF-8 bytes of a string alongside the string object. When the string is not UTF-8 encodable and we have to create temporary objects, we run into trouble. So the list here artificially extends the lifetime of new objects that are guaranteed to be UTF-8 encodable, to rely on that caching mechanism.

To be honest I would be -1 on this change and would rather call it a wontfix. It is a very niche issue that fights the internals of CPython (and, for that matter, of pyarrow, whose strings are UTF-8).

If someone wanted mixed-encoding Python strings like this, I think pa.binary() is a better data type choice (a sketch follows below).
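
For reference, a minimal sketch of that alternative (assuming pyarrow is installed; the names here are illustrative): lone surrogates survive an errors="surrogatepass" encode, and the resulting bytes can be stored in a binary column as-is.

    import pyarrow as pa

    data = ["1 \udcd6a NY", "2 \udcd6b NY"]
    # surrogatepass produces lossless bytes even for lone surrogates
    raw = [s.encode("utf-8", errors="surrogatepass") for s in data]
    arr = pa.array(raw, type=pa.binary())
    # round-trip back to the original strings, lone surrogates included
    assert [b.as_py().decode("utf-8", errors="surrogatepass") for b in arr] == data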

Member Author:

OK, I'll close this PR then. My main motivation was to make test_unique_bad_unicode not flaky due to this issue, but I'll open a follow-up PR making this test permanently xfail(strict=True) instead.

Member:

I think the name test_unique_bad_unicode is a misnomer. The code point "\ud83d" exists in Unicode; it is a high surrogate:

https://www.unicode.org/charts/PDF/UD800.pdf

The problem is that by itself that high surrogate doesn't mean anything (it would need to be paired with a low surrogate). As such, it doesn't represent any glyph in any encoding.

AFAIU if you wanted to keep that unicode code point, you would have to:

  1. Convert the Python Unicode object to bytes via str.encode(<encoding>, errors="surrogatepass")
  2. Run your algorithms against the surrogatepass bytes
  3. Convert your surrogatepassed bytes back to a unicode string via bytes.decode(<encoding>, errors="surrogatepass")

I realize step 2 may not exist today, but it is a good impetus to work on interop with pa.binary() if this is required.
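
Concretely, steps 1 and 3 in plain Python (step 2 would be whatever algorithm runs against the bytes):

    s = "3 \ud800c NY"  # contains a lone high surrogate

    # step 1: strict UTF-8 rejects lone surrogates; surrogatepass keeps them
    raw = s.encode("utf-8", errors="surrogatepass")

    # step 3: decoding with surrogatepass restores the original code points
    assert raw.decode("utf-8", errors="surrogatepass") == s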

vecs = <const char **>malloc(n * sizeof(char *))
if vecs is NULL:
raise MemoryError()
@@ -1197,7 +1199,9 @@ cdef class StringHashTable(HashTable):
try:
v = get_c_string(<str>val)
except UnicodeEncodeError:
v = get_c_string(<str>repr(val))
rval = <str>repr(val)
keep_bad_unicode_refs.append(rval)
v = get_c_string(rval)
vecs[i] = v

# compute
@@ -1223,6 +1227,8 @@ cdef class StringHashTable(HashTable):
idx = self.table.vals[k]
labels[i] = idx

keep_bad_unicode_refs.clear()
del keep_bad_unicode_refs
free(vecs)

# uniques
22 changes: 21 additions & 1 deletion pandas/tests/base/test_unique.py
@@ -99,7 +99,6 @@ def test_nunique_null(null_obj, index_or_series_obj):
assert obj.nunique(dropna=False) == max(0, num_unique_values)


@pytest.mark.single_cpu
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails")
def test_unique_bad_unicode(index_or_series):
# regression test for #34550
@@ -116,6 +115,27 @@ def test_unique_bad_unicode(index_or_series):
tm.assert_numpy_array_equal(result, expected)


def test_unique_bad_unicode2(index_or_series):
# regression test for #45929
data_list = [
"1 \udcd6a NY",
"2 \udcd6b NY",
"3 \ud800c NY",
"4 \udcd6d NY",
"5 \udcc3e NY",
]

obj = index_or_series(data_list)
result = obj.unique()
if isinstance(obj, pd.Index):
expected = pd.Index(data_list, dtype=object)
tm.assert_index_equal(result, expected)
else:
expected = np.array(data_list, dtype=object)
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("dropna", [True, False])
def test_nunique_dropna(dropna):
# GH37566
ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])