pandas-dev
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 34 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎asv_bench/benchmarks/io/csv.py‎
Lines changed: 23 additions & 0 deletions b/‎asv_bench/benchmarks/io/csv.py‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎doc/source/whatsnew/v1.4.0.rst‎
Lines changed: 2 additions & 0 deletions b/‎doc/source/whatsnew/v1.4.0.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pandas/_libs/parsers.pyx‎
Lines changed: 20 additions & 1 deletion b/‎pandas/_libs/parsers.pyx‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎pandas/_testing/__init__.py‎
Lines changed: 4 additions & 1 deletion b/‎pandas/_testing/__init__.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎pandas/_testing/compat.py‎
Lines changed: 10 additions & 0 deletions b/‎pandas/_testing/compat.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎pandas/io/formats/csvs.py‎
Lines changed: 2 additions & 0 deletions b/‎pandas/io/formats/csvs.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pandas/io/parsers/arrow_parser_wrapper.py‎
Lines changed: 5 additions & 1 deletion b/‎pandas/io/parsers/arrow_parser_wrapper.py‎
Lines changed: 5 additions & 1 deletion
@@ -78,6 +78,40 @@ jobs:
  run: pytest scripts
  if: always()
 
+ benchmarks:
+ name: Benchmarks
+ runs-on: ubuntu-latest
+ defaults:
+ run:
+ shell: bash -l {0}
+
+ concurrency:
+ # https://github.community/t/concurrecy-not-work-for-push/183068/7
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-benchmarks
+ cancel-in-progress: true
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Cache conda
+ uses: actions/cache@v2
+ with:
+ path: ~/conda_pkgs_dir
+ key: ${{ runner.os }}-conda-${{ hashFiles('${{ env.ENV_FILE }}') }}
+
+ - uses: conda-incubator/setup-miniconda@v2
+ with:
+ activate-environment: pandas-dev
+ channel-priority: strict
+ environment-file: ${{ env.ENV_FILE }}
+ use-only-tar-bz2: true
+
+ - name: Build Pandas
+ uses: ./.github/actions/build_pandas
+
  - name: Running benchmarks
  run: |
  cd asv_bench
 
@@ -55,6 +55,26 @@ def time_frame(self, kind):
  self.df.to_csv(self.fname)
 
 
+class ToCSVMultiIndexUnusedLevels(BaseIO):
+
+ fname = "__test__.csv"
+
+ def setup(self):
+ df = DataFrame({"a": np.random.randn(100_000), "b": 1, "c": 1})
+ self.df = df.set_index(["a", "b"])
+ self.df_unused_levels = self.df.iloc[:10_000]
+ self.df_single_index = df.set_index(["a"]).iloc[:10_000]
+
+ def time_full_frame(self):
+ self.df.to_csv(self.fname)
+
+ def time_sliced_frame(self):
+ self.df_unused_levels.to_csv(self.fname)
+
+ def time_single_index_frame(self):
+ self.df_single_index.to_csv(self.fname)
+
+
 class ToCSVDatetime(BaseIO):
 
  fname = "__test__.csv"
@@ -78,6 +98,9 @@ def setup(self):
  def time_frame_date_formatting_index(self):
  self.data.to_csv(self.fname, date_format="%Y-%m-%d %H:%M:%S")
 
+ def time_frame_date_no_format_index(self):
+ self.data.to_csv(self.fname)
+
 
 class ToCSVDatetimeBig(BaseIO):
 
 
@@ -592,6 +592,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.mad` (:issue:`43010`)
 - Performance improvement in :func:`merge` (:issue:`43332`)
 - Performance improvement in :func:`to_csv` when index column is a datetime and is formatted (:issue:`39413`)
+- Performance improvement in :func:`to_csv` when :class:`MultiIndex` contains a lot of unused levels (:issue:`37484`)
 - Performance improvement in :func:`read_csv` when ``index_col`` was set with a numeric column (:issue:`44158`)
 - Performance improvement in :func:`concat` (:issue:`43354`)
 -
@@ -757,6 +758,7 @@ I/O
 - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
 - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
+- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values as a float dtype resulted in the missing values becoming ``1.0`` rather than ``NaN`` (:issue:`42808`, :issue:`34120`)
 - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
 - Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`)
 - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
 
@@ -1086,8 +1086,27 @@ cdef class TextReader:
  break
 
  # we had a fallback parse on the dtype, so now try to cast
- # only allow safe casts, eg. with a nan you cannot safely cast to int
  if col_res is not None and col_dtype is not None:
+ # If col_res is bool, it might actually be a bool array mixed with NaNs
+ # (see _try_bool_flex()). Usually this would be taken care of using
+ # _maybe_upcast(), but if col_dtype is a floating type we should just
+ # take care of that cast here.
+ if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
+ mask = col_res.view(np.uint8) == na_values[np.uint8]
+ col_res = col_res.astype(col_dtype)
+ np.putmask(col_res, mask, np.nan)
+ return col_res, na_count
+
+ # NaNs are already cast to True here, so astype cannot be used
+ if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
+ if na_count > 0:
+ raise ValueError(
+ f"cannot safely convert passed user dtype of "
+ f"{col_dtype} for {np.bool_} dtyped data in "
+ f"column {i} due to NA values"
+ )
+
+ # only allow safe casts, eg. with a nan you cannot safely cast to int
  try:
  col_res = col_res.astype(col_dtype, casting='safe')
  except TypeError:
 
@@ -89,7 +89,10 @@
  assert_timedelta_array_equal,
  raise_assert_detail,
 )
-from pandas._testing.compat import get_dtype # noqa:F401
+from pandas._testing.compat import ( # noqa:F401
+ get_dtype,
+ get_obj,
+)
 from pandas._testing.contexts import ( # noqa:F401
  RNGContext,
  decompress_file,
 
@@ -11,3 +11,13 @@ def get_dtype(obj):
  return obj.dtypes.iat[0]
  else:
  return obj.dtype
+
+
+def get_obj(df: DataFrame, klass):
+ """
+ For sharing tests using frame_or_series, either return the DataFrame
+ unchanged or return its first column as a Series.
+ """
+ if klass is DataFrame:
+ return df
+ return df._ixs(0, axis=1)
@@ -186,6 +186,8 @@ def data_index(self) -> Index:
  data_index = Index(
  [x.strftime(self.date_format) if notna(x) else "" for x in data_index]
  )
+ elif isinstance(data_index, ABCMultiIndex):
+ data_index = data_index.remove_unused_levels()
  return data_index
 
  @property
 
@@ -130,7 +130,11 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
  frame.index.names = [None] * len(frame.index.names)
 
  if self.kwds.get("dtype") is not None:
- frame = frame.astype(self.kwds.get("dtype"))
+ try:
+ frame = frame.astype(self.kwds.get("dtype"))
+ except TypeError as e:
+ # GH#44901 reraise to keep api consistent
+ raise ValueError(e)
  return frame
 
  def read(self) -> DataFrame:
Original file line number	Diff line number	Diff line change
`@@ -186,6 +186,8 @@ def data_index(self) -> Index:`
`186`	`186`	`data_index = Index(`
`187`	`187`	`[x.strftime(self.date_format) if notna(x) else "" for x in data_index]`
`188`	`188`	`)`
	`189`	`+ elif isinstance(data_index, ABCMultiIndex):`
	`190`	`+ data_index = data_index.remove_unused_levels()`
`189`	`191`	`return data_index`
`190`	`192`
`191`	`193`	`@property`