Skip to content
Prev Previous commit
Next Next commit
Move test
  • Loading branch information
yuanx749 committed Apr 30, 2024
commit 6f93a8d9a8d50acbbf055b328cc51ccce2018a11
20 changes: 19 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@
except ImportError:
has_pyarrow = False
else:
del pa
has_pyarrow = True

import zoneinfo
Expand Down Expand Up @@ -1367,6 +1366,25 @@ def any_string_dtype(request):
return request.param


@pytest.fixture(
params=[
"object",
"string[python]",
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
pytest.param(pd.ArrowDtype(pa.string()), marks=td.skip_if_no("pyarrow")),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would actually prefer to just add the pd.ArrowDtype(pa.string()) to the existing string dtypes instead of copying and creating a new fixture. Guessing that causes a lot of other test failures?

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I forgot that pd.ArrowDtype(pa.string()) was not actually in the fixture, so my suggestion lead you a bit in the wrong way. Sorry!

Right now adding this to the main any_string_dtype fixture will indeed give quite some failures, yes. I agree that it might be better to actually do that (and it would be interesting to see which tests actually fail), but that's for another PR / out of scope for this bug fix (doing so would also require removing some tests are now only exist for the arrow string dtype, to not keep things duplicated).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, a lot of other tests need to be adjusted if adding ArrowDtype to the fixture.

So for this PR, should I just add test in pandas/tests/extension/test_arrow.py?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #58495 so we can track the larger issue

]
)
def any_string_dtype_2(request):
"""
Parametrized fixture for string dtypes.
* 'object'
* 'string[python]'
* 'string[pyarrow]'
"""
return request.param


@pytest.fixture(params=tm.DATETIME64_DTYPES)
def datetime64_dtype(request):
"""
Expand Down
10 changes: 0 additions & 10 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2296,16 +2296,6 @@ def test_str_split_pat_none(method):
tm.assert_series_equal(result, expected)


def test_str_split_regex_none():
# GH 58321
ser = pd.Series(["230/270/270", "240-290-290"], dtype=ArrowDtype(pa.string()))
result = ser.str.split(r"/|-", regex=None)
expected = pd.Series(
ArrowExtensionArray(pa.array([["230", "270", "270"], ["240", "290", "290"]]))
)
tm.assert_series_equal(result, expected)


def test_str_split():
# GH 52401
ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))
Expand Down
32 changes: 24 additions & 8 deletions pandas/tests/strings/test_split_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
object_pyarrow_numpy,
)

pa = pytest.importorskip("pyarrow")

from pandas.core.arrays.arrow.array import ArrowExtensionArray


@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split(any_string_dtype, method):
Expand Down Expand Up @@ -59,27 +63,39 @@ def test_split_regex(any_string_dtype):
tm.assert_series_equal(result, exp)


def test_split_regex_explicit(any_string_dtype):
def test_split_regex_explicit(any_string_dtype_2):
# explicit regex = True split with compiled regex
regex_pat = re.compile(r".jpg")
values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
result = values.str.split(regex_pat)
exp = Series([["xx", "zzz", ""]])
tm.assert_series_equal(result, exp)
values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype_2)

if not isinstance(any_string_dtype_2, pd.ArrowDtype):
# ArrowDtype does not support compiled regex
result = values.str.split(regex_pat)
exp = Series([["xx", "zzz", ""]])
tm.assert_series_equal(result, exp)

# explicit regex = False split
result = values.str.split(r"\.jpg", regex=False)
exp = Series([["xxxjpgzzz.jpg"]])
if not isinstance(any_string_dtype_2, pd.ArrowDtype):
exp = Series([["xxxjpgzzz.jpg"]])
else:
exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz.jpg"]])))
tm.assert_series_equal(result, exp)

# non explicit regex split, pattern length == 1
result = values.str.split(r".")
exp = Series([["xxxjpgzzz", "jpg"]])
if not isinstance(any_string_dtype_2, pd.ArrowDtype):
exp = Series([["xxxjpgzzz", "jpg"]])
else:
exp = Series(ArrowExtensionArray(pa.array([["xxxjpgzzz", "jpg"]])))
tm.assert_series_equal(result, exp)

# non explicit regex split, pattern length != 1
result = values.str.split(r".jpg")
exp = Series([["xx", "zzz", ""]])
if not isinstance(any_string_dtype_2, pd.ArrowDtype):
exp = Series([["xx", "zzz", ""]])
else:
exp = Series(ArrowExtensionArray(pa.array([["xx", "zzz", ""]])))
tm.assert_series_equal(result, exp)

# regex=False with pattern compiled regex raises error
Expand Down