-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
Refactor string methods for StringArray + return IntegerArray for numeric results #29640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
14ca804 07fc53d ddef378 a147081 36b1bf7 d01ddfe 22a2c79 686de87 0ea2d6b 358ce59 7b25d8d 46dfc20 1b96e53 3b20ffc e4bae5e File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -120,11 +120,11 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): | |
| # should really _check_ for NA | ||
| if is_extension_array_dtype(arr.dtype): | ||
TomAugspurger marked this conversation as resolved. Show resolved Hide resolved | ||
| arr = extract_array(arr) | ||
| return _map_ea(f, arr, na_value=na_result, dtype=dtype) | ||
| return _stringarray_map(f, arr, na_value=na_result, dtype=dtype) | ||
| ||
| return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) | ||
| | ||
| | ||
| def _map_ea( | ||
| def _stringarray_map( | ||
| func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype | ||
| Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. na_value restricted to scalar? Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I don't think so. | ||
| ) -> ArrayLike: | ||
| from pandas.arrays import IntegerArray, StringArray | ||
| | @@ -160,9 +160,6 @@ def _map_ea( | |
| return StringArray(result) | ||
| # TODO: BooleanArray | ||
| else: | ||
TomAugspurger marked this conversation as resolved. Show resolved Hide resolved | ||
| import pdb | ||
| | ||
| pdb.set_trace() | ||
| return lib.map_infer_mask(arr, func, mask.view("uint8")) | ||
| | ||
| | ||
| | @@ -688,7 +685,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): | |
| raise ValueError("Cannot use a callable replacement when regex=False") | ||
| f = lambda x: x.replace(pat, repl, n) | ||
| | ||
| return _na_map(f, arr) | ||
| return _na_map(f, arr, dtype=str) | ||
| | ||
| | ||
| def str_repeat(arr, repeats): | ||
| | @@ -739,7 +736,7 @@ def scalar_rep(x): | |
| except TypeError: | ||
| return str.__mul__(x, repeats) | ||
| | ||
| return _na_map(scalar_rep, arr) | ||
| return _na_map(scalar_rep, arr, dtype=str) | ||
| else: | ||
| | ||
| def rep(x, r): | ||
| | @@ -1204,7 +1201,7 @@ def str_join(arr, sep): | |
| 4 NaN | ||
| dtype: object | ||
| """ | ||
| return _na_map(sep.join, arr) | ||
| return _na_map(sep.join, arr, dtype=str) | ||
| | ||
| | ||
| def str_findall(arr, pat, flags=0): | ||
| | @@ -1435,7 +1432,7 @@ def str_pad(arr, width, side="left", fillchar=" "): | |
| else: # pragma: no cover | ||
| raise ValueError("Invalid side") | ||
| | ||
| return _na_map(f, arr) | ||
| return _na_map(f, arr, dtype=str) | ||
| | ||
| | ||
| def str_split(arr, pat=None, n=None): | ||
| | @@ -1541,7 +1538,7 @@ def str_slice(arr, start=None, stop=None, step=None): | |
| """ | ||
| obj = slice(start, stop, step) | ||
| f = lambda x: x[obj] | ||
| return _na_map(f, arr) | ||
| return _na_map(f, arr, dtype=str) | ||
| | ||
| | ||
| def str_slice_replace(arr, start=None, stop=None, repl=None): | ||
| | @@ -1632,7 +1629,7 @@ def f(x): | |
| y += x[local_stop:] | ||
| return y | ||
| | ||
| return _na_map(f, arr) | ||
| return _na_map(f, arr, dtype=str) | ||
| | ||
| | ||
| def str_strip(arr, to_strip=None, side="both"): | ||
| | @@ -1657,7 +1654,7 @@ def str_strip(arr, to_strip=None, side="both"): | |
| f = lambda x: x.rstrip(to_strip) | ||
| else: # pragma: no cover | ||
| raise ValueError("Invalid side") | ||
| return _na_map(f, arr) | ||
| return _na_map(f, arr, dtype=str) | ||
| | ||
| | ||
| def str_wrap(arr, width, **kwargs): | ||
| | @@ -1721,7 +1718,7 @@ def str_wrap(arr, width, **kwargs): | |
| | ||
| tw = textwrap.TextWrapper(**kwargs) | ||
| | ||
| return _na_map(lambda s: "\n".join(tw.wrap(s)), arr) | ||
| return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) | ||
| | ||
| | ||
| def str_translate(arr, table): | ||
| | @@ -1741,7 +1738,7 @@ def str_translate(arr, table): | |
| ------- | ||
| Series or Index | ||
| """ | ||
| return _na_map(lambda x: x.translate(table), arr) | ||
| return _na_map(lambda x: x.translate(table), arr, dtype=str) | ||
| | ||
| | ||
| def str_get(arr, i): | ||
| | @@ -3079,7 +3076,7 @@ def normalize(self, form): | |
| import unicodedata | ||
| | ||
| f = lambda x: unicodedata.normalize(form, x) | ||
| result = _na_map(f, self._parent) | ||
| result = _na_map(f, self._parent, dtype=str) | ||
| return self._wrap_result(result) | ||
| | ||
| _shared_docs[ | ||
| | @@ -3277,31 +3274,37 @@ def rindex(self, sub, start=0, end=None): | |
| lambda x: x.lower(), | ||
| name="lower", | ||
| docstring=_shared_docs["casemethods"] % _doc_args["lower"], | ||
| dtype=str, | ||
| ) | ||
| upper = _noarg_wrapper( | ||
| lambda x: x.upper(), | ||
| name="upper", | ||
| docstring=_shared_docs["casemethods"] % _doc_args["upper"], | ||
| dtype=str, | ||
| ) | ||
| title = _noarg_wrapper( | ||
| lambda x: x.title(), | ||
| name="title", | ||
| docstring=_shared_docs["casemethods"] % _doc_args["title"], | ||
| dtype=str, | ||
| ) | ||
| capitalize = _noarg_wrapper( | ||
| lambda x: x.capitalize(), | ||
| name="capitalize", | ||
| docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], | ||
| dtype=str, | ||
| ) | ||
| swapcase = _noarg_wrapper( | ||
| lambda x: x.swapcase(), | ||
| name="swapcase", | ||
| docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], | ||
| dtype=str, | ||
| ) | ||
| casefold = _noarg_wrapper( | ||
| lambda x: x.casefold(), | ||
| name="casefold", | ||
| docstring=_shared_docs["casemethods"] % _doc_args["casefold"], | ||
| dtype=str, | ||
| ) | ||
| | ||
| _shared_docs[ | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you sync the signatures up with _map, also rename _map -> _map_arraylike (or similar)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, don't follow this. I think they do match, aside from @jbrockmendel's request to call it
funcrather thanf.I guess
na_mapcalls itna_resultwhile_mapcalls itna_value.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am really -1 on 2 different branches here. If they have exactly the same signature a little less negative. again I would rename _map to be more informative here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you mean with "-1 on 2 different branches here" ?
The whole purpose of this PR is to add a separate branch in case of StringArray (because we can be more efficient and want to be more specific in the result dtype)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's why my initial PR was so much more complex, since it tried to handle both cases similarly. I think that was more complex than this.
As Joris says, the main point of divergence is that for StringArray we usually know the result dtype exactly. It doesn't depend on the presence of NAs. Additionally,
map_infer_dtypefor both, so the core implementation is the same..stron object-dtype, so we will end up with just this implementation.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, at the very least these signatures for what you call _map and _stringarray_map should be exactly the same.
and _map -> _map_object and _stringarray_map -> _map_stringarray.
I think this is crucial for not adding technical debt.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback is your suggestion to add
na_maskto the _stringarray_map signature and have it just not be used? I think this relates to the "this should be a StringArray method" discussionThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
_mapand_na_mapalready have inconsistent signatures. I'm not sure why it's that way on master, but I'm a bit against adding unused arguments in this case.What's the technical debt we're adding here? By definition, we need to handle StringArray differently, since its result type doesn't depend on the presence of NAs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And there is also a good reason for that, as
_maphas an additional argumentna_maskthat is used internally in_map(for a recursive call).I think refactoring
_mapis outside of the scope of this PR.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you create an issue to clean this up (_map and _na_map), and/or refactor of this, post this PR.