Allows for merging of SparseDataFrames, and fixes `__array__` interface #19488
---

```diff
@@ -238,6 +238,7 @@ Other Enhancements
 
 - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`)
 - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`)
+- :func:`pandas.merge` now supports merging of :class:`SparseDataFrame` (:issue:`13665`)
 
 .. _whatsnew_0230.api_breaking:
 
```

```diff
@@ -555,7 +556,7 @@ Sparse
 
 - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`)
 - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`)
--
+- Bug in :class:`SparseSeries.__array__` returning only non-fill values (:issue:`13665`)
 
 Reshaping
 ^^^^^^^^^
```

```diff
@@ -591,3 +592,4 @@ Other
 ^^^^^
 
 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
+- Improved ``algorithms.take_1d`` handling of ``SparseArray`` (:issue:`19506`)
```
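The headline entry in one runnable sketch: a minimal example, assuming a pre-1.0 pandas where `SparseDataFrame` and `DataFrame.to_sparse` still exist (the frame contents here are invented):

```python
import pandas as pd

# Two frames sharing the join key 'A', converted to SparseDataFrames.
evens = pd.DataFrame({'A': range(0, 20, 2), 'B': range(10)}).to_sparse()
threes = pd.DataFrame({'A': range(0, 30, 3), 'C': range(10)}).to_sparse()

# With this PR applied, merging sparse frames should behave like the
# dense case instead of failing (:issue:`13665`).
result = threes.merge(evens, how='left', on='A')
print(result.head())
```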
---

```diff
@@ -1315,6 +1315,11 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
     undefined if allow_fill == False and -1 is present in indexer.
     """
 
+    if is_sparse(arr):
+        return take_nd(arr.get_values(), indexer, axis=axis, out=out,
+                       fill_value=fill_value, mask_info=mask_info,
+                       allow_fill=allow_fill)
+
     # dispatch to internal type takes
     if is_categorical(arr):
         return arr.take_nd(indexer, fill_value=fill_value,
```

> **Contributor (Author):** Something that is a bit of a hotspot, IMHO, is that inside of algos it iterates through the memoryview and could cause some weird security / segfault issues. This fills that hole, but we really should fix it at a lower level.
>
> **Contributor:** Hmm, this seems sub-optimal. We shouldn't have to densify just to slice, right? Can you take a look at … And since you're touching performance-related code, could you run the sparse-related ASVs?
>
> **Contributor (Author):** Thank you for pointing this out; exactly what I was looking for. Will run the ASVs.
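The new branch simply densifies and recurses. A rough sketch of what that dispatch amounts to, assuming a pandas of this era where `SparseArray.get_values()` returns the dense ndarray:

```python
import numpy as np
import pandas as pd

arr = pd.SparseArray([1.0, np.nan, 3.0])

# What the new take_nd branch effectively does: densify first, then let
# the plain ndarray take path handle the indexer.
dense = arr.get_values()
print(dense.take([0, 2]))   # -> [1. 3.]
```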
---

```diff
@@ -3089,6 +3089,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
         # GH#19265 pyarrow is passing this
         warnings.warn("fastpath argument is deprecated, will be removed "
                       "in a future release.", DeprecationWarning)
+
     if klass is None:
         dtype = dtype or values.dtype
         klass = get_block_type(values, dtype)
```

```diff
@@ -5304,6 +5305,22 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
         elif is_uniform_join_units(join_units):
             b = join_units[0].block.concat_same_type(
                 [ju.block for ju in join_units], placement=placement)
+        elif is_sparse_join_units(join_units):
+            values = concatenate_join_units(join_units, concat_axis, copy=copy)
+
+            if len(values.shape) == 2:
+                values = values[0]
+            else:
+                assert len(values.shape) == 1
+
+            block = join_units[0].block
+
+            if block:
+                fill_value = block.fill_value
+            else:
+                fill_value = np.nan
+            array = SparseArray(values, fill_value=fill_value)
+            b = make_block(array, klass=SparseBlock, placement=placement)
         else:
             b = make_block(
                 concatenate_join_units(join_units, concat_axis, copy=copy),
```

> **Contributor:** Do you ever go down the initial …
>
> **Contributor:** This is a mess.
>
> **Contributor (Author):** Yes, yes it is. I'll work on cleaning this up tomorrow, since there are too many branches in this code.
```diff
@@ -5313,6 +5330,18 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
     return BlockManager(blocks, axes)
 
 
+def is_sparse_join_units(join_units):
+    """
+    Check if all of the join units are sparse. This leads to building
+    SparseArray over dense array representations so that we can merge
+    SparseSeries / SparseDataFrame.
+
+    This is very similar to how pandas.concat works for concatenating two
+    SparseDataFrame / SparseSeries objects.
+    """
+    return all(type(ju.block) is SparseBlock for ju in join_units)
+
+
 def is_uniform_join_units(join_units):
     """
     Check if the join units consist of blocks of uniform type that can
```

> **Contributor:** A docstring would be nice, at least noting that this is true only if all blocks are sparse.
>
> **Contributor (Author):** This is very similar to (and maybe could be DRYed up with) `pd.concat([sparse, sparse])`.
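As the author notes, the new branch mirrors what `pd.concat` already does for sparse inputs. A quick check of that existing behavior, assuming a pre-1.0 pandas (exact output may vary by version):

```python
import pandas as pd

sdf1 = pd.DataFrame({'A': [0, 0, 1]}).to_sparse(fill_value=0)
sdf2 = pd.DataFrame({'A': [2, 0, 0]}).to_sparse(fill_value=0)

# Concatenating sparse frames keeps the result sparse; the new
# is_sparse_join_units branch aims to give block concatenation during
# merge the same property.
out = pd.concat([sdf1, sdf2])
print(type(out).__name__)   # expected: SparseDataFrame
```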
```diff
@@ -5686,7 +5715,10 @@ def is_na(self):
     def get_reindexed_values(self, empty_dtype, upcasted_na):
         if upcasted_na is None:
             # No upcasting is necessary
-            fill_value = self.block.fill_value
+
+            # You would think that you want self.block.fill_value here
+            # But in reality that will fill with a bunch of wrong values
+            fill_value = np.nan
             values = self.block.get_values()
         else:
             fill_value = upcasted_na
```

> **Contributor (Author):** This was surprising to me. If you don't pass in `np.nan`, what ends up happening is that if you merge two sparse frames it will concat with a bunch of …
>
> **Contributor:** huh?
>
> **Contributor (Author):** Yeah, this confused me a bunch.
>
> ```python
> evens = pd.DataFrame({'A': range(0, 20, 2), 'B': range(10)})
> threes = pd.DataFrame({'A': range(0, 30, 3), 'B': range(10)})
> threes.merge(evens, how="left", on="A")
> ```
>
> This yields what you'd expect for a dense dataframe, but if you sparsify it with a `fill_value` other than `np.nan` / `None` you get weird results:
>
> ```python
> evens = pd.DataFrame({'A': range(0, 20, 2), 'B': range(10)}).to_sparse(fill_value=8675309)
> threes = pd.DataFrame({'A': range(0, 30, 3), 'B': range(10)}).to_sparse(fill_value=90210)
> threes.merge(evens, how="left", on="A")
> ```
>
> Is that expected behavior?
---

```diff
@@ -38,6 +38,8 @@
     concatenate_block_managers)
 from pandas.util._decorators import Appender, Substitution
 
+from pandas.core.sparse.array import SparseArray
+
 from pandas.core.sorting import is_int64_overflow_possible
 import pandas.core.algorithms as algos
 import pandas.core.sorting as sorting
```

```diff
@@ -665,7 +667,6 @@ def _maybe_restore_index_levels(self, result):
         result.set_index(names_to_restore, inplace=True)
 
     def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
-
         left_has_missing = None
         right_has_missing = None
 
```

```diff
@@ -731,7 +732,11 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                 if mask.all():
                     key_col = rvals
                 else:
-                    key_col = Index(lvals).where(~mask, rvals)
+                    # Might need to be IntIndex not Index
+                    if isinstance(lvals, SparseArray):
+                        key_col = Index(lvals.get_values()).where(~mask, rvals)
+                    else:
+                        key_col = Index(lvals).where(~mask, rvals)
 
                 if result._is_label_reference(name):
                     result[name] = key_col
```

> **Contributor:** Don't do this, use …
>
> **Contributor (Author):** I'm not sure if this has memory or performance issues, but it is the best solution I could come up with. The other option would be to look at using `lvals.sp_index` and implementing a `where` on it that works. One thing I have noticed is that `IntIndex` doesn't act quite like `Index`, which makes doing these masks tricky in sparse land.
>
> **Contributor:** It'd be nice to avoid …
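A toy version of the masking pattern in the new branch, showing why the sparse left keys are densified before `where` is applied. The arrays here are invented; in the real code `mask` marks rows with no match on the left side:

```python
import numpy as np
import pandas as pd

lvals = pd.SparseArray([0.0, np.nan, 2.0, np.nan])  # left join keys (sparse)
rvals = np.array([10.0, 11.0, 12.0, 13.0])          # right join keys (dense)
mask = np.isnan(lvals.get_values())                 # rows missing on the left

# Densify the sparse keys, then keep them where present and fall back
# to the right-hand keys elsewhere, as in the new isinstance branch.
key_col = pd.Index(lvals.get_values()).where(~mask, rvals)
print(list(key_col))   # [0.0, 11.0, 2.0, 13.0]
```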
---

```diff
@@ -38,7 +38,6 @@
 from pandas.util._decorators import Appender
 from pandas.core.indexes.base import _index_shared_docs
 
-
 _sparray_doc_kwargs = dict(klass='SparseArray')
 
 
```

```diff
@@ -259,6 +258,7 @@ def __array_wrap__(self, out_arr, context=None):
             ufunc, args, domain = context
             # to apply ufunc only to fill_value (to avoid recursive call)
             args = [getattr(a, 'fill_value', a) for a in args]
+
             with np.errstate(all='ignore'):
                 fill_value = ufunc(self.fill_value, *args[1:])
         else:
```

```diff
@@ -292,9 +292,9 @@ def __setstate__(self, state):
         self._fill_value = fill_value
 
     def __len__(self):
-        try:
+        if hasattr(self, 'sp_index'):
             return self.sp_index.length
-        except:
+        else:
             return 0
 
     def __unicode__(self):
```

> **Contributor (Author):** Using a bare try / except to catch something here felt really icky to me... Feel free to push back on that change.
>
> **Contributor:** How was this failing before?
>
> **Contributor (Author):** I have no clue! 😄 I couldn't figure out why the try / except was there in the first place, to be honest.
>
> **Contributor:** huh?
>
> **Contributor (Author):** This isn't essential to this PR; I just don't understand why this code was here in the first place. Meaning I can take it out to bring down the code changes.
---

```diff
@@ -73,6 +73,10 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
             if columns is None:
                 raise Exception("cannot pass a series w/o a name or columns")
             data = {columns[0]: data}
+        elif isinstance(data, BlockManager):
+            fill_value_size = len(set(b.fill_value for b in data.blocks))
+            if default_fill_value is None and fill_value_size == 1:
+                default_fill_value = data.blocks[0].fill_value
 
         if default_fill_value is None:
             default_fill_value = np.nan
```

> **Contributor (Author):** OK, I need feedback on this change. Basically what this does is: if every SparseSeries's `fill_value` is the same, it sets the `default_fill_value` to that. This seems really intuitive to me, but I trust your judgement. It also makes testing a hell of a lot easier, which is why I did it. I would argue pretty heavily that as a user, if I added sparse series that all had `fill_value=0`, then I would expect the sparse data frame to have `default_fill_value = 0` as well.
>
> **Contributor:** Yes, this is the only reason to have a standalone …
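A sketch of the behavior the author is arguing for, assuming a pre-1.0 pandas. Note the inference in the diff fires on the internal `BlockManager` path (e.g. during merge or concat), so whether this exact constructor call reaches it is an assumption:

```python
import pandas as pd

a = pd.Series([0, 0, 1, 2]).to_sparse(fill_value=0)
b = pd.Series([0, 3, 0, 4]).to_sparse(fill_value=0)

sdf = pd.SparseDataFrame({'a': a, 'b': b})
# Under the proposed inference, a frame built from series that all share
# fill_value=0 would report default_fill_value == 0 rather than NaN.
print(sdf.default_fill_value)
```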
---

```diff
@@ -175,7 +175,7 @@ def values(self):
 
     def __array__(self, result=None):
         """ the array interface, return my values """
-        return self.block.values
+        return self.block.values.values
 
     def get_values(self):
         """ same as values """
```

> **Contributor (Author):** This will return the dense version of SparseSeries when calling …
>
> **Contributor:** We need some more ASVs for sparse, I think.
```diff
@@ -271,6 +271,7 @@ def __array_wrap__(self, result, context=None):
 
         See SparseArray.__array_wrap__ for detail.
         """
+
         if isinstance(context, tuple) and len(context) == 3:
             ufunc, args, domain = context
             args = [getattr(a, 'fill_value', a) for a in args]
```

```diff
@@ -279,8 +280,18 @@ def __array_wrap__(self, result, context=None):
         else:
             fill_value = self.fill_value
 
+        # GH 14167
+        # Since we are returning a dense representation of
+        # SparseSeries, sparse_index might not align when calling
+        # a ufunc on the array. There doesn't seem to be a better way
+        # to do this, unfortunately.
+        if len(result) != self.sp_index.npoints:
+            sparse_index = None
+        else:
+            sparse_index = self.sp_index
+
         return self._constructor(result, index=self.index,
-                                 sparse_index=self.sp_index,
+                                 sparse_index=sparse_index,
                                  fill_value=fill_value,
                                  copy=False).__finalize__(self)
```

> **Contributor (Author):** This wasn't intuitive to me, but the `__array*__` functions defined in numpy don't work quite as expected in certain circumstances. What would happen is that the dense version of the array would have 3 npoints when really it should only have 2. Zeroing out the sparse_index fixes the problem, because the constructor then relies on the dense index instead, going around the problem.
>
> **Contributor:** These should simply operate on the sp_values, not on the densified values (or maybe it's possible that some classes of ufuncs could operate on the dense values, but not sure).
>
> **Contributor (Author):** So the expected behavior is …
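To see the guard in action, a hedged ufunc round trip, assuming a pre-1.0 pandas (whether the npoints mismatch actually occurs depends on the ufunc and fill_value):

```python
import numpy as np
import pandas as pd

s = pd.Series([-1.0, 0.0, 0.0, 2.0]).to_sparse(fill_value=0.0)

# np.abs densifies via __array__, so the result can have length 4 while
# s.sp_index.npoints is 2; __array_wrap__ then discards the stale
# sparse_index and lets the constructor rebuild it from the dense data.
print(np.abs(s))
```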
```diff
@@ -402,8 +413,8 @@ def abs(self):
         -------
         abs: type of caller
         """
-        return self._constructor(np.abs(self.values),
-                                 index=self.index).__finalize__(self)
+
+        return np.abs(self)
 
     def get(self, label, default=None):
         """
```

```diff
@@ -544,7 +555,7 @@ def to_dense(self, sparse_only=False):
             index = self.index.take(int_index.indices)
             return Series(self.sp_values, index=index, name=self.name)
         else:
-            return Series(self.values.to_dense(), index=self.index,
+            return Series(self.get_values(), index=self.index,
                           name=self.name)
 
     @property
```
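Both simplified methods lean on the dense path; a minimal usage sketch, assuming a pre-1.0 pandas:

```python
import pandas as pd

s = pd.Series([1.0, 0.0, 2.0]).to_sparse(fill_value=0.0)

print(s.to_dense())   # dense Series, now built from get_values()
print(s.abs())        # abs now routes through np.abs(self)
```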
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Clarify: merging sparse to sparse? Sparse and dense?