Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
bb904cb
ENH: add BooleanArray extension array (#29555)
jorisvandenbossche Nov 25, 2019
13c7ea3
move
TomAugspurger Nov 26, 2019
fff786f
doc fixup
TomAugspurger Nov 26, 2019
4067e7f
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Nov 26, 2019
708c553
working
TomAugspurger Nov 26, 2019
c56894e
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Nov 27, 2019
2e9d547
updates
TomAugspurger Nov 27, 2019
373aaab
updates
TomAugspurger Nov 27, 2019
7f78a64
Raise for NaN
TomAugspurger Nov 27, 2019
36b171b
added tests for empty
TomAugspurger Nov 27, 2019
747e046
added tests for inplace mutation
TomAugspurger Nov 27, 2019
d0a8cca
Do not assume masked values are False
TomAugspurger Nov 27, 2019
fe061b0
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Nov 27, 2019
9f9e44c
mypy
TomAugspurger Nov 27, 2019
0a34257
doc fixups
TomAugspurger Nov 27, 2019
2ba0034
Added benchmarks
TomAugspurger Nov 27, 2019
2d1129a
update tests
TomAugspurger Nov 27, 2019
a24fc22
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Nov 27, 2019
77dd1fc
remove unneded setitem
TomAugspurger Nov 27, 2019
7b9002c
optimize
TomAugspurger Nov 27, 2019
c18046b
comments
TomAugspurger Nov 27, 2019
1237caa
just do the xor
TomAugspurger Nov 27, 2019
2ecf9b8
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Dec 2, 2019
87aeb09
fixup docstring
TomAugspurger Dec 2, 2019
969b6dc
fix label
TomAugspurger Dec 2, 2019
1c9ba49
PERF: faster or
TomAugspurger Dec 2, 2019
8eec954
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Dec 4, 2019
cb47b6a
handle pd.NA
TomAugspurger Dec 4, 2019
2a946b9
validate
TomAugspurger Dec 4, 2019
efb6f8b
please mypy
TomAugspurger Dec 4, 2019
004238e
move to nanops
TomAugspurger Dec 4, 2019
5a2c81c
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Dec 5, 2019
7032318
move
TomAugspurger Dec 5, 2019
bbb7f9b
numpy scalars
TomAugspurger Dec 5, 2019
ce763b4
doc note
TomAugspurger Dec 5, 2019
5bc5328
handle numpy bool
TomAugspurger Dec 5, 2019
457bd08
Merge remote-tracking branch 'upstream/master' into boolean-array-kleene
TomAugspurger Dec 6, 2019
31c2bc6
cleanup
TomAugspurger Dec 6, 2019
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
working
  • Loading branch information
TomAugspurger committed Nov 26, 2019
commit 708c553078ac450ef457780c32207f236c0bfec9
3 changes: 2 additions & 1 deletion doc/source/user_guide/boolean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ Kleene Logic
:class:`arrays.BooleanArray` implements Kleene logic (sometimes called three-value logic) for
logical operations like ``&`` (and), ``|`` (or) and ``^`` (exclusive-or).

Here's a table for ``and``.
This table demonstrates the results for every combination. These operations are symmetrical,
so flipping the left- and right-hand side makes no difference in the result.

================= =========
Expression Result
Expand Down
129 changes: 94 additions & 35 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,13 +562,13 @@ def logical_method(self, other):
# Rely on pandas to unbox and dispatch to us.
return NotImplemented

assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
other = lib.item_from_zerodim(other)
omask = mask = None
other_is_booleanarray = isinstance(other, BooleanArray)
mask = None

if other_is_booleanarray:
other, omask = other._data, other._mask
mask = omask
other, mask = other._data, other._mask
elif is_list_like(other):
other = np.asarray(other, dtype="bool")
if other.ndim > 1:
Expand All @@ -579,41 +579,15 @@ def logical_method(self, other):
raise ValueError("Lengths must match to compare")
other, mask = coerce_to_array(other, copy=False)

# numpy will show a DeprecationWarning on invalid elementwise
# comparisons, this will raise in the future
if lib.is_scalar(other) and np.isnan(
other
): # TODO(NA): change to libmissing.NA:
result = self._data
mask = True
else:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)

# nans propagate
if mask is None:
mask = self._mask
else:
mask = self._mask | mask

# Kleene-logic adjustments to the mask.
if op.__name__ in {"or_", "ror_"}:
mask[result] = False
result, mask = kleene_or(self._data, other, self._mask, mask)
return BooleanArray(result, mask)
elif op.__name__ in {"and_", "rand_"}:
mask[~self._data & ~self._mask] = False
if other_is_booleanarray:
mask[~other & ~omask] = False
elif lib.is_scalar(other) and np.isnan(other): # TODO(NA): change to NA
mask[:] = True
# Do we ever assume that masked values are False?
result[mask] = False
result, mask = kleene_and(self._data, other, self._mask, mask)
return BooleanArray(result, mask)
elif op.__name__ in {"xor", "rxor"}:
# Do we ever assume that masked values are False?
result[mask] = False

return BooleanArray(result, mask)
result, mask = kleene_xor(self._data, other, self._mask, mask)
return BooleanArray(result, mask)

name = "__{name}__".format(name=op.__name__)
return set_function_name(logical_method, name, cls)
Expand Down Expand Up @@ -766,6 +740,91 @@ def boolean_arithmetic_method(self, other):
return set_function_name(boolean_arithmetic_method, name, cls)


def kleene_or(left, right, left_mask, right_mask):
if left_mask is None:
return kleene_or(right, left, right_mask, left_mask)

assert left_mask is not None
assert isinstance(left, np.ndarray)
assert isinstance(left_mask, np.ndarray)

mask = left_mask

if right_mask is not None:
mask = mask | right_mask
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this still needed with the new code below to create the mask?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe so, though I may be wrong...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah sorry, not needed after using your code. Thanks.

else:
mask = mask.copy()

# handle scalars:
if lib.is_scalar(right) and np.isnan(right):
result = left.copy()
mask = left_mask.copy()
mask[~result] = True
return result, mask

# XXX: this implicitly relies on masked values being False!
result = left | right
mask[result] = False

# update
return result, mask


def kleene_xor(left, right, left_mask, right_mask):
    """
    Element-wise exclusive-or using Kleene (three-valued) logic.

    Unlike ``or`` and ``and``, a known operand can never determine the
    outcome of ``xor`` on its own, so the result is missing wherever either
    operand is missing. The mask is therefore computed from the input masks
    only, never from the underlying values, which means the values stored
    behind masked positions do not influence the output.

    Parameters
    ----------
    left, right : ndarray or scalar
        The operands. After the optional swap below, ``left`` is expected to
        be a boolean ndarray; ``right`` may be a boolean ndarray, a boolean
        scalar, or a NaN scalar standing in for a missing value.
    left_mask, right_mask : ndarray or None
        Masks for the operands, where True marks a missing value. A mask of
        None denotes an operand without a mask (a scalar). Callers are
        expected to supply a mask for at least one operand.

    Returns
    -------
    result : ndarray
        The xor values, set to False at every masked position.
    mask : ndarray
        True where the result is missing. Always a new array; neither input
        mask is modified.
    """
    if left_mask is None:
        # xor is symmetric, so put the operand that carries a mask on the left.
        return kleene_xor(right, left, right_mask, left_mask)

    if lib.is_scalar(right) and np.isnan(right):
        # A NaN scalar is a missing value: every element of the result is
        # missing. ``np.isnan`` (rather than ``is np.nan``) also recognises
        # NaN objects other than the ``np.nan`` singleton, matching the
        # check used by ``kleene_or``.
        return np.zeros_like(left), np.ones_like(left_mask)

    result = left ^ right

    if right_mask is None:
        # Copy so the caller's mask is never aliased by the returned array.
        mask = left_mask.copy()
    else:
        mask = left_mask | right_mask

    # Keep the values behind masked positions normalised to False, as the
    # previous implementation did.
    result[mask] = False
    return result, mask


def kleene_and(left, right, left_mask, right_mask):
if left_mask is None:
return kleene_and(right, left, right_mask, left_mask)

mask = left_mask

if right_mask is not None:
mask = mask | right_mask
else:
mask = mask.copy()

if lib.is_scalar(right):
result = left.copy()
mask = left_mask.copy()
if np.isnan(right):
mask[result] = True
else:
result = result & right # already copied.
if right is False:
# unmask everything
mask[:] = False
else:
result = left & right
# unmask where either left or right is False
mask[~left & ~left_mask] = False
mask[~right & ~right_mask] = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar comment here, I think that something like this can be faster:

left_false = ~left & ~left_mask
right_false = ~right & ~right_mask
mask = (left_mask & ~right_false) | (right_mask & ~left_false)

(avoiding setitem)

And need to think if we can avoid some ~

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Further optimization:

left_false = ~(left | left_mask)
right_false = ~(right | right_mask)
mask = (left_mask & ~right_false) | (right_mask & ~left_false)

Timing comparison:

left = np.random.randint(0, 2, 1000).astype(bool)
right = np.random.randint(0, 2, 1000).astype(bool)
left_mask = np.random.randint(0, 2, 1000).astype(bool)
right_mask = np.random.randint(0, 2, 1000).astype(bool)

In [47]: %%timeit
    ...: mask = left_mask | right_mask
    ...: mask[~left & ~left_mask] = False
    ...: mask[~right & ~right_mask] = False
7.2 µs ± 106 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

In [58]: %%timeit
    ...: left_false = ~(left | left_mask)
    ...: right_false= ~(right | right_mask)
    ...:
    ...: mask = (left_mask & ~right_false) | (right_mask & ~left_false)
3.73 µs ± 275 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And on bigger arrays, the difference is much bigger, it seems. For 100_000 elements, I get 775 µs vs 45 µs

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I'll also add an asv for these ops.


result[mask] = False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to do this (we can't rely on that in general anyway, and then this just gives a performance degradation)

return result, mask


BooleanArray._add_logical_ops()
BooleanArray._add_comparison_ops()
BooleanArray._add_arithmetic_ops()
49 changes: 37 additions & 12 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,13 +413,22 @@ def test_kleene_or(self):
result = b | a
tm.assert_extension_array_equal(result, expected)

def test_kleene_or_scalar(self):
@pytest.mark.parametrize(
"other, expected",
[
(np.nan, [True, None, None]),
(True, [True, True, True]),
(False, [True, False, None]),
],
)
def test_kleene_or_scalar(self, other, expected):
# TODO: test True & False
a = pd.array([True, False, None], dtype="boolean")
result = a | np.nan # TODO: pd.NA
expected = pd.array([True, None, None], dtype="boolean")
result = a | other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)

result = np.nan | a # TODO: pd.NA
result = other | a
tm.assert_extension_array_equal(result, expected)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -456,13 +465,21 @@ def test_kleene_and(self):
result = b & a
tm.assert_extension_array_equal(result, expected)

def test_kleene_and_scalar(self):
@pytest.mark.parametrize(
"other, expected",
[
(np.nan, [None, False, None]),
(True, [True, False, None]),
(False, [False, False, False]),
],
)
def test_kleene_and_scalar(self, other, expected):
a = pd.array([True, False, None], dtype="boolean")
result = a & np.nan # TODO: pd.NA
expected = pd.array([None, None, None], dtype="boolean")
result = a & other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)

result = np.nan & a # TODO: pd.na
result = other & a
tm.assert_extension_array_equal(result, expected)

def test_kleene_xor(self):
Expand All @@ -477,13 +494,21 @@ def test_kleene_xor(self):
result = b ^ a
tm.assert_extension_array_equal(result, expected)

def test_kleene_scalar(self):
@pytest.mark.parametrize(
"other, expected",
[
(np.nan, [None, None, None]),
(True, [False, True, None]),
(False, [True, False, None]),
],
)
def test_kleene_xor_scalar(self, other, expected):
a = pd.array([True, False, None], dtype="boolean")
result = a ^ np.nan # TODO: pd.NA
expected = pd.array([None, None, None], dtype="boolean")
result = a ^ other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)

result = np.nan ^ a # TODO: pd.NA
result = other ^ a
tm.assert_extension_array_equal(result, expected)


Expand Down