Merged
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/sparse.py
@@ -195,4 +195,17 @@ def time_take(self, indices, allow_fill):
self.sp_arr.take(indices, allow_fill=allow_fill)


class GetItem:
def setup(self):
N = 1_000_000
arr = make_array(N, 1e-5, np.nan, np.float64)
self.sp_arr = SparseArray(arr)

def time_integer_indexing(self):
self.sp_arr[78]

def time_slice(self):
self.sp_arr[1:]


from .pandas_vb_common import setup # noqa: F401 isort:skip
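The new GetItem benchmark above indexes a million-element array with roughly 1e-5 density. For a rough sense of what it measures outside asv, here is a standalone sketch (not part of the PR: make_array is approximated with plain numpy, and only public pandas APIs are used):

import timeit

import numpy as np
from pandas.arrays import SparseArray

# Approximation of make_array(N, 1e-5, np.nan, np.float64): mostly-NaN
# data with about N * 1e-5 stored values.
N = 1_000_000
rng = np.random.default_rng(0)
dense = np.full(N, np.nan)
dense[rng.choice(N, size=int(N * 1e-5), replace=False)] = 1.0
sp_arr = SparseArray(dense)

# With the fast path, sp_arr[1:] works on sp_values/sp_index directly
# instead of materializing a dense intermediate first.
print(timeit.timeit(lambda: sp_arr[1:], number=100))
print(timeit.timeit(lambda: sp_arr[78], number=100))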
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -357,8 +357,10 @@ Performance improvements
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
- Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
- Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
-

.. ---------------------------------------------------------------------------

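A minimal demonstration of the new whatsnew entry, using only the public constructor: a step-1 slice now returns a SparseArray built straight from the stored values, so the sparse dtype is preserved without a dense intermediate.

import numpy as np
from pandas.arrays import SparseArray

sp = SparseArray([np.nan, np.nan, 1.0, np.nan, 2.0])
out = sp[1:4]              # contiguous slice, step=1 -> new fast path
print(type(out).__name__)  # SparseArray
print(out.dtype)           # Sparse[float64, nan]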
44 changes: 35 additions & 9 deletions pandas/core/arrays/sparse/array.py
@@ -892,13 +892,39 @@ def __getitem__(
elif isinstance(key, tuple):
data_slice = self.to_dense()[key]
elif isinstance(key, slice):
# special case to preserve dtypes
if key == slice(None):
return self.copy()
# TODO: this logic is surely elsewhere
# TODO: this could be more efficient
indices = np.arange(len(self), dtype=np.int32)[key]
return self.take(indices)

# Avoid densifying when handling contiguous slices
if key.step is None or key.step == 1:
start = 0 if key.start is None else key.start
if start < 0:
start += len(self)

end = len(self) if key.stop is None else key.stop
if end < 0:
end += len(self)

indices = self.sp_index.to_int_index().indices
keep_inds = np.flatnonzero((indices >= start) & (indices < end))
sp_vals = self.sp_values[keep_inds]

sp_index = indices[keep_inds].copy()

# If we've sliced to not include the start of the array, all our indices
# should be shifted. NB: here we are careful to also not shift by a
# negative value for a case like [0, 1][-100:] where the start index
# should be treated like 0
if start > 0:
sp_index -= start

# Length of our result should match applying this slice to a range
# of the length of our original array
new_len = len(range(len(self))[key])
new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
else:
indices = np.arange(len(self), dtype=np.int32)[key]
return self.take(indices)

else:
# TODO: I think we can avoid densifying when masking a
# boolean SparseArray with another. Need to look at the
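The new branch above never densifies for step-1 slices: it filters the stored positions down to the [start, end) window, shifts them to be relative to the slice, and rebuilds the index at the sliced length. A hedged standalone sketch of the same logic on plain numpy arrays (slice_sparse is a hypothetical helper, not pandas API):

import numpy as np

def slice_sparse(length, sp_values, sp_indices, key):
    """Apply a step-1 slice without building a dense array.

    sp_values  -- the stored (non-fill) values
    sp_indices -- their positions in the dense equivalent
    """
    assert key.step in (None, 1)

    start = 0 if key.start is None else key.start
    if start < 0:
        start += length

    end = length if key.stop is None else key.stop
    if end < 0:
        end += length

    # Keep only stored values whose position falls inside [start, end)
    keep = np.flatnonzero((sp_indices >= start) & (sp_indices < end))
    new_vals = sp_values[keep]
    new_inds = sp_indices[keep]  # fancy indexing already yields a copy

    # Shift positions so they are relative to the slice; skip negative
    # shifts so a case like [0, 1][-100:] is treated as starting at 0
    if start > 0:
        new_inds -= start

    # The result length matches slicing a range of the original length
    new_len = len(range(length)[key])
    return new_len, new_vals, new_inds

# dense [nan, nan, 7, nan, 8][1:4] keeps the 7 at relative position 1
print(slice_sparse(5, np.array([7.0, 8.0]), np.array([2, 4]), np.s_[1:4]))
# -> (3, array([7.]), array([1]))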
@@ -1745,10 +1771,10 @@ def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex

def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
index: SparseIndex
if kind == "block" or isinstance(kind, BlockIndex):
if kind == "block":
locs, lens = splib.get_blocks(indices)
index = BlockIndex(length, locs, lens)
elif kind == "integer" or isinstance(kind, IntIndex):
elif kind == "integer":
index = IntIndex(length, indices)
else: # pragma: no cover
raise ValueError("must be block or integer type")
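The make_sparse_index cleanup above drops the now-unreachable isinstance checks, so kind is strictly the string "block" or "integer". The two index layouts can be compared through the public constructor; a short sketch:

import numpy as np
from pandas.arrays import SparseArray

dense = np.array([0.0, 1.0, 2.0, 0.0, 3.0])

int_arr = SparseArray(dense, fill_value=0.0, kind="integer")
blk_arr = SparseArray(dense, fill_value=0.0, kind="block")

print(int_arr.sp_index)  # IntIndex: stored positions [1, 2, 4]
print(blk_arr.sp_index)  # BlockIndex: block starts [1, 4], lengths [2, 1]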
48 changes: 31 additions & 17 deletions pandas/tests/arrays/sparse/test_array.py
@@ -679,23 +679,37 @@ def test_getitem_arraylike_mask(self):
expected = SparseArray([0, 2])
tm.assert_sp_array_equal(result, expected)

def test_getslice(self):
result = self.arr[:-3]
exp = SparseArray(self.arr.to_dense()[:-3])
tm.assert_sp_array_equal(result, exp)

result = self.arr[-4:]
exp = SparseArray(self.arr.to_dense()[-4:])
tm.assert_sp_array_equal(result, exp)

# two corner cases from Series
result = self.arr[-12:]
exp = SparseArray(self.arr)
tm.assert_sp_array_equal(result, exp)

result = self.arr[:-12]
exp = SparseArray(self.arr.to_dense()[:0])
tm.assert_sp_array_equal(result, exp)
@pytest.mark.parametrize(
"slc",
[
np.s_[:],
np.s_[1:10],
np.s_[1:100],
np.s_[10:1],
np.s_[:-3],
np.s_[-5:-4],
np.s_[:-12],
np.s_[-12:],
np.s_[2:],
np.s_[2::3],
np.s_[::2],
np.s_[::-1],
np.s_[::-2],
np.s_[1:6:2],
np.s_[:-6:-2],
],
)
@pytest.mark.parametrize(
"as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []]
)
def test_getslice(self, slc, as_dense):
as_dense = np.array(as_dense)
arr = SparseArray(as_dense)

result = arr[slc]
expected = SparseArray(as_dense[slc])

tm.assert_sp_array_equal(result, expected)

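The rewritten test relies on np.s_, numpy's slice-construction helper: indexing it simply returns the slice object, which lets slices be passed as parametrize values. For example:

import numpy as np

assert np.s_[1:10] == slice(1, 10, None)
assert np.s_[::-2] == slice(None, None, -2)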
def test_getslice_tuple(self):
dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])