Skip to content

Commit 83f3ca1

Browse files
committed
Merge branch 'master' into ref-replace-2
2 parents ddd71b6 + 97b2890 commit 83f3ca1

File tree

11 files changed

+448
-455
lines changed

11 files changed

+448
-455
lines changed

.github/workflows/ci.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,40 @@ jobs:
7878
run: pytest scripts
7979
if: always()
8080

81+
benchmarks:
82+
name: Benchmarks
83+
runs-on: ubuntu-latest
84+
defaults:
85+
run:
86+
shell: bash -l {0}
87+
88+
concurrency:
89+
# https://github.community/t/concurrecy-not-work-for-push/183068/7
90+
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-benchmarks
91+
cancel-in-progress: true
92+
93+
steps:
94+
- name: Checkout
95+
uses: actions/checkout@v2
96+
with:
97+
fetch-depth: 0
98+
99+
- name: Cache conda
100+
uses: actions/cache@v2
101+
with:
102+
path: ~/conda_pkgs_dir
103+
# Pass the env context value straight to hashFiles(); expressions cannot be
# nested, so a quoted '${{ ... }}' would be treated as a literal path pattern.
key: ${{ runner.os }}-conda-${{ hashFiles(env.ENV_FILE) }}
104+
105+
- uses: conda-incubator/setup-miniconda@v2
106+
with:
107+
activate-environment: pandas-dev
108+
channel-priority: strict
109+
environment-file: ${{ env.ENV_FILE }}
110+
use-only-tar-bz2: true
111+
112+
- name: Build Pandas
113+
uses: ./.github/actions/build_pandas
114+
81115
- name: Running benchmarks
82116
run: |
83117
cd asv_bench

asv_bench/benchmarks/io/csv.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,26 @@ def time_frame(self, kind):
5555
self.df.to_csv(self.fname)
5656

5757

58+
class ToCSVMultiIndexUnusedLevels(BaseIO):
59+
60+
fname = "__test__.csv"
61+
62+
def setup(self):
63+
df = DataFrame({"a": np.random.randn(100_000), "b": 1, "c": 1})
64+
self.df = df.set_index(["a", "b"])
65+
self.df_unused_levels = self.df.iloc[:10_000]
66+
self.df_single_index = df.set_index(["a"]).iloc[:10_000]
67+
68+
def time_full_frame(self):
69+
self.df.to_csv(self.fname)
70+
71+
def time_sliced_frame(self):
72+
self.df_unused_levels.to_csv(self.fname)
73+
74+
def time_single_index_frame(self):
75+
self.df_single_index.to_csv(self.fname)
76+
77+
5878
class ToCSVDatetime(BaseIO):
5979

6080
fname = "__test__.csv"
@@ -78,6 +98,9 @@ def setup(self):
7898
def time_frame_date_formatting_index(self):
7999
self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S")
80100

101+
def time_frame_date_no_format_index(self):
102+
self.data.to_csv(self.fname)
103+
81104

82105
class ToCSVDatetimeBig(BaseIO):
83106

doc/source/whatsnew/v1.4.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,7 @@ Performance improvements
592592
- Performance improvement in :meth:`Series.mad` (:issue:`43010`)
593593
- Performance improvement in :func:`merge` (:issue:`43332`)
594594
- Performance improvement in :func:`to_csv` when index column is a datetime and is formatted (:issue:`39413`)
595+
- Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`)
595596
- Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`)
596597
- Performance improvement in :func:`concat` (:issue:`43354`)
597598
-
@@ -757,6 +758,7 @@ I/O
757758
- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
758759
- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
759760
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
761+
- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`, :issue:`34120`)
760762
- Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
761763
- Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`)
762764
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)

pandas/_libs/parsers.pyx

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1086,8 +1086,27 @@ cdef class TextReader:
10861086
break
10871087

10881088
# we had a fallback parse on the dtype, so now try to cast
1089-
# only allow safe casts, eg. with a nan you cannot safely cast to int
10901089
if col_res is not None and col_dtype is not None:
1090+
# If col_res is bool, it might actually be a bool array mixed with NaNs
1091+
# (see _try_bool_flex()). Usually this would be taken care of using
1092+
# _maybe_upcast(), but if col_dtype is a floating type we should just
1093+
# take care of that cast here.
1094+
if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
1095+
mask = col_res.view(np.uint8) == na_values[np.uint8]
1096+
col_res = col_res.astype(col_dtype)
1097+
np.putmask(col_res, mask, np.nan)
1098+
return col_res, na_count
1099+
1100+
# NaNs are already cast to True here, so can not use astype
1101+
if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
1102+
if na_count > 0:
1103+
raise ValueError(
1104+
f"cannot safely convert passed user dtype of "
1105+
f"{col_dtype} for {np.bool_} dtyped data in "
1106+
f"column {i} due to NA values"
1107+
)
1108+
1109+
# only allow safe casts, eg. with a nan you cannot safely cast to int
10911110
try:
10921111
col_res = col_res.astype(col_dtype, casting='safe')
10931112
except TypeError:

pandas/_testing/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,10 @@
8989
assert_timedelta_array_equal,
9090
raise_assert_detail,
9191
)
92-
from pandas._testing.compat import get_dtype # noqa:F401
92+
from pandas._testing.compat import ( # noqa:F401
93+
get_dtype,
94+
get_obj,
95+
)
9396
from pandas._testing.contexts import ( # noqa:F401
9497
RNGContext,
9598
decompress_file,

pandas/_testing/compat.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,13 @@ def get_dtype(obj):
1111
return obj.dtypes.iat[0]
1212
else:
1313
return obj.dtype
14+
15+
16+
def get_obj(df: DataFrame, klass):
17+
"""
18+
For sharing tests using frame_or_series, either return the DataFrame
19+
unchanged or return its first column as a Series.
20+
"""
21+
if klass is DataFrame:
22+
return df
23+
return df._ixs(0, axis=1)

pandas/io/formats/csvs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ def data_index(self) -> Index:
186186
data_index = Index(
187187
[x.strftime(self.date_format) if notna(x) else "" for x in data_index]
188188
)
189+
elif isinstance(data_index, ABCMultiIndex):
190+
data_index = data_index.remove_unused_levels()
189191
return data_index
190192

191193
@property

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,11 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
130130
frame.index.names = [None] * len(frame.index.names)
131131

132132
if self.kwds.get("dtype") is not None:
133-
frame = frame.astype(self.kwds.get("dtype"))
133+
try:
134+
frame = frame.astype(self.kwds.get("dtype"))
135+
except TypeError as e:
136+
# GH#44901 reraise to keep api consistent
137+
raise ValueError(e)
134138
return frame
135139

136140
def read(self) -> DataFrame:

0 commit comments

Comments
 (0)