SparseArray is an ExtensionArray #22325
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -373,6 +373,34 @@ is the case with :attr:`Period.end_time`, for example | |
| | ||
| p.end_time | ||
| | ||
| .. _whatsnew_0240.api_breaking.sparse_values: | ||
| | ||
| ``SparseArray`` is now an ``ExtensionArray`` | ||
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
| | ||
| ``SparseArray`` now implements the ``ExtensionArray`` interface (:issue:`21978`, :issue:`19056`, :issue:`22835`). | ||
| To conform to this interface, and for consistency with the rest of pandas, some API breaking | ||
| changes were made: | ||
| | ||
| - ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. | ||
| - ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. | ||
| - :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) | ||
| - ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`). | ||
| * The default value of ``allow_fill`` has changed from ``False`` to ``True``. | ||
| * The ``out`` and ``mode`` parameters are no longer accepted (previously, this raised if they were specified). | ||
| * Passing a scalar for ``indices`` is no longer allowed. | ||
| - The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. | ||
| - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer support combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. | ||
| - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. | ||
Member: I can't remember if I asked before, but do we actually want this? I don't think the above makes much sense, so not sure this is good to allow. For me it seems logical to restrict the fill_value to the same dtype as the data.
Contributor (author): The somewhat strange thing is that on master we do allow that in the SparseArray constructor:
    In [13]: s = pd.SparseArray([1, 2, 0], fill_value=np.nan)
    In [14]: s
    Out[14]:
    [1, 2, 0]
    Fill: nan
    IntIndex
    Indices: array([0, 1, 2], dtype=int32)
I don't have strong opinions here, other than that people shouldn't be setting ...
Contributor: I agree the fill type should match the dtype, but since missing value support is allowed here it is prob ok.
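Not part of the PR diff: a minimal sketch illustrating the breaking changes listed above, assuming a pandas build (0.24.0+) that includes this change. Values in the comments are expected output, not captured results.

```python
# Minimal sketch of the new SparseArray behaviour described in the list
# above (assumes pandas >= 0.24).
import numpy as np
import pandas as pd

arr = pd.SparseArray([1.0, np.nan, 2.0])

# No longer an ndarray subclass; use np.asarray to get a dense copy with
# *all* values, not just the non-fill values.
print(isinstance(arr, np.ndarray))  # False
print(np.asarray(arr))              # [ 1. nan  2.]

# The dtype is now a SparseDtype wrapping the underlying numpy dtype.
print(arr.dtype)          # Sparse[float64, nan]
print(arr.dtype.subtype)  # float64

# take follows the ExtensionArray API: with allow_fill=True, -1 means
# "missing" and is replaced by the fill value rather than indexing from
# the end of the array.
print(arr.take([0, -1], allow_fill=True))  # [1.0, nan]
```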
| | ||
| | ||
| Some new warnings are issued for operations that require or are likely to materialize a large dense array: | ||
| | ||
| - A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. | ||
| - A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. | ||
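A hedged sketch (not from the PR) of the first warning described above; the exact warning message is not quoted from the implementation, and the behaviour assumes pandas >= 0.24.

```python
# Filling a sparse array with a method densifies it internally and issues a
# PerformanceWarning; filling with a scalar value stays sparse and is the
# efficient path.
import warnings

import numpy as np
import pandas as pd
from pandas.errors import PerformanceWarning

arr = pd.SparseArray([1.0, np.nan, np.nan, 2.0])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    arr.fillna(method="ffill")  # constructs a dense array internally
print(any(issubclass(w.category, PerformanceWarning) for w in caught))  # True

arr.fillna(0.0)  # value-based fill: no warning, result stays sparse
```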
| | ||
| In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made <whatsnew_0240.bug_fixes.sparse>`. | ||
| | ||
| .. _whatsnew_0240.api.datetimelike.normalize: | ||
| | ||
| Tick DateOffset Normalize Restrictions | ||
| | @@ -621,6 +649,7 @@ Other API Changes | |
| - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) | ||
| - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) | ||
| - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) | ||
| - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) | ||
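An illustrative sketch (not from the PR) of the single-row slicing entry above; the categorical columns here are only an assumed setup to show the idea, assuming pandas >= 0.24.

```python
# A DataFrame whose columns all share one extension dtype no longer coerces
# a single-row slice to object dtype.
import pandas as pd

df = pd.DataFrame({
    "a": pd.Categorical(["x", "y"], categories=["x", "y"]),
    "b": pd.Categorical(["y", "x"], categories=["x", "y"]),
})

row = df.iloc[0]
print(row.dtype)  # category (previously coerced to object)
```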
| | ||
| .. _whatsnew_0240.deprecations: | ||
| | ||
| | @@ -860,13 +889,6 @@ Groupby/Resample/Rolling | |
| - :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) | ||
| - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) | ||
| | ||
| Sparse | ||
| ^^^^^^ | ||
| | ||
| - | ||
| - | ||
| - | ||
| | ||
| Reshaping | ||
| ^^^^^^^^^ | ||
| | ||
| | @@ -884,6 +906,20 @@ Reshaping | |
| - Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) | ||
| - Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) | ||
| | ||
| .. _whatsnew_0240.bug_fixes.sparse: | ||
| | ||
| Sparse | ||
| ^^^^^^ | ||
| | ||
| - Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) | ||
Contributor: Really, we have support for this? Again, I agree this is a nice feature, but we are decreasing support generally for sparse, so not anxious to advertise this.
| - Bug in :meth:`Series.to_sparse` where a Series already holding sparse data was not constructed properly (:issue:`22389`) | ||
| - Providing a ``sparse_index`` to the ``SparseArray`` constructor no longer defaults the na value to ``np.nan`` for all dtypes. The correct ``na_value`` for ``data.dtype`` is now used. | ||
| - Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. | ||
| - Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. | ||
| - A ``SparseDtype`` with a boolean subtype is now considered boolean by :meth:`api.types.is_bool_dtype`. | ||
| - Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) | ||
| - Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) | ||
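Not part of the PR: a short sketch of two of the entries above, assuming pandas 0.24, where ``SparseSeries`` still exists.

```python
# Boolean SparseDtype now registers as boolean, and unary inversion works
# on a boolean SparseSeries.
import pandas as pd
from pandas.api.types import is_bool_dtype

print(is_bool_dtype(pd.SparseDtype(bool)))  # True

s = pd.SparseSeries([True, False, True])
print(~s)  # elementwise inversion now works (and is faster) on sparse bools
```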
| | ||
| Build Changes | ||
| ^^^^^^^^^^^^^ | ||
| | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -93,11 +93,13 @@ def _get_series_result_type(result, objs=None): | |
| def _get_frame_result_type(result, objs): | ||
| """ | ||
| return appropriate class of DataFrame-like concat | ||
| if all blocks are SparseBlock, return SparseDataFrame | ||
| if all blocks are sparse, return SparseDataFrame | ||
| otherwise, return 1st obj | ||
| """ | ||
| | ||
| if result.blocks and all(b.is_sparse for b in result.blocks): | ||
| if (result.blocks and ( | ||
| all(is_sparse(b) for b in result.blocks) or | ||
Contributor: Related to my comment above. Can is_sparse not simply check whether it's an EA and whether it has a Sparse dtype? Then you simply need to pass the ...
Contributor (author): I'll give that a shot.
Contributor: Can you add a comment here, it's not obvious what you are doing.
Contributor: How can obj be a SparseFrame here? Is this tested?
Contributor (author): I think a comment of mine may have been lost. This is hit in several places (e.g. ...). What part can I clarify here?
| all(isinstance(obj, ABCSparseDataFrame) for obj in objs))): | ||
| from pandas.core.sparse.api import SparseDataFrame | ||
| return SparseDataFrame | ||
| else: | ||
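For context (not part of the diff), a rough usage sketch of what the branch above decides, assuming pandas 0.24, where ``SparseDataFrame`` still exists.

```python
# When every input to concat is sparse (or every obj is a SparseDataFrame),
# the concatenated result is a SparseDataFrame; otherwise the class of the
# first object is used.
import pandas as pd

sdf1 = pd.SparseDataFrame({"a": [0, 1, 0]})
sdf2 = pd.SparseDataFrame({"a": [0, 0, 2]})

result = pd.concat([sdf1, sdf2])
print(type(result).__name__)  # SparseDataFrame
```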
| | @@ -554,61 +556,23 @@ def _concat_sparse(to_concat, axis=0, typs=None): | |
| a single array, preserving the combined dtypes | ||
| """ | ||
| | ||
| from pandas.core.sparse.array import SparseArray, _make_index | ||
| from pandas.core.sparse.array import SparseArray | ||
| | ||
| def convert_sparse(x, axis): | ||
| # coerce to native type | ||
| if isinstance(x, SparseArray): | ||
| x = x.get_values() | ||
| else: | ||
| x = np.asarray(x) | ||
| x = x.ravel() | ||
| if axis > 0: | ||
| x = np.atleast_2d(x) | ||
| return x | ||
| fill_values = [x.fill_value for x in to_concat | ||
| if isinstance(x, SparseArray)] | ||
| | ||
| if typs is None: | ||
| typs = get_dtype_kinds(to_concat) | ||
| if len(set(fill_values)) > 1: | ||
| raise ValueError("Cannot concatenate SparseArrays with different " | ||
| "fill values") | ||
| | ||
| if len(typs) == 1: | ||
| # concat input as it is if all inputs are sparse | ||
| # and have the same fill_value | ||
| fill_values = {c.fill_value for c in to_concat} | ||
| if len(fill_values) == 1: | ||
| sp_values = [c.sp_values for c in to_concat] | ||
| indexes = [c.sp_index.to_int_index() for c in to_concat] | ||
| | ||
| indices = [] | ||
| loc = 0 | ||
| for idx in indexes: | ||
| indices.append(idx.indices + loc) | ||
| loc += idx.length | ||
| sp_values = np.concatenate(sp_values) | ||
| indices = np.concatenate(indices) | ||
| sp_index = _make_index(loc, indices, kind=to_concat[0].sp_index) | ||
| | ||
| return SparseArray(sp_values, sparse_index=sp_index, | ||
| fill_value=to_concat[0].fill_value) | ||
| | ||
| # input may be sparse / dense mixed and may have different fill_value | ||
| # input must contain sparse at least 1 | ||
| sparses = [c for c in to_concat if is_sparse(c)] | ||
| fill_values = [c.fill_value for c in sparses] | ||
| sp_indexes = [c.sp_index for c in sparses] | ||
| | ||
| # densify and regular concat | ||
| to_concat = [convert_sparse(x, axis) for x in to_concat] | ||
| result = np.concatenate(to_concat, axis=axis) | ||
| | ||
| if not len(typs - {'sparse', 'f', 'i'}): | ||
| # sparsify if inputs are sparse and dense numerics | ||
| # first sparse input's fill_value and SparseIndex is used | ||
| result = SparseArray(result.ravel(), fill_value=fill_values[0], | ||
| kind=sp_indexes[0]) | ||
| else: | ||
| # coerce to object if needed | ||
| result = result.astype('object') | ||
| return result | ||
| fill_value = list(fill_values)[0] | ||
| | ||
| # TODO: Fix join unit generation so we aren't passed this. | ||
| to_concat = [x if isinstance(x, SparseArray) | ||
| else SparseArray(x.squeeze(), fill_value=fill_value) | ||
| for x in to_concat] | ||
| | ||
| return SparseArray._concat_same_type(to_concat) | ||
| | ||
| | ||
| def _concat_rangeindex_same_dtype(indexes): | ||
| | ||