Skip to content
Prev Previous commit
Next Next commit
ENH catch the int32 overflow error earlier and in two separate places: in pivot_table and unstack
  • Loading branch information
anhqle committed Jul 30, 2018
commit 01a79439e4ede04d4e03aeb812e7c72035a17e1d
5 changes: 5 additions & 0 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
index = _convert_by(index)
columns = _convert_by(columns)

num_rows = data.reindex(index, axis='columns').shape[0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are doing extra work here (e.g. the reindex); can we not do the calculation directly?

num_columns = data.reindex(columns, axis='columns').shape[0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this error is now in 2 places.

if num_rows * num_columns > (2 ** 31 - 1):
raise ValueError('Pivot table is too big, causing int32 overflow')

if isinstance(aggfunc, list):
pieces = []
keys = []
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ def __init__(self, values, index, level=-1, value_columns=None,
self.removed_level = self.new_index_levels.pop(self.level)
self.removed_level_full = index.levels[self.level]

num_rows = np.max([index_level.size for index_level in self.new_index_levels])
num_columns = self.removed_level.size
if num_rows * num_columns > (2 ** 31 - 1):
raise ValueError('Unstacked data frame is too big, causing int32 overflow')

self._make_sorted_values_labels()
self._make_selectors()

Expand Down Expand Up @@ -161,8 +166,6 @@ def _make_selectors(self):
self.full_shape = ngroups, stride

selector = self.sorted_labels[-1] + stride * comp_index + self.lift
if np.prod(self.full_shape) > (2 ** 31 - 1):
raise ValueError('Pivot table is too big, causing int32 overflow')
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
mask.put(selector, True)

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1278,11 +1278,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
@pytest.mark.slow
def test_pivot_number_of_levels_larger_than_int32(self):
# GH 20601
data = DataFrame({'ind1': list(range(1337600)) * 2,
'ind2': list(range(3040)) * 2 * 440,
'count': [1] * 2 * 1337600})
df = DataFrame({'ind1': np.arange(2 ** 16),
'ind2': np.arange(2 ** 16),
'count': np.arange(2 ** 16)})
with tm.assert_raises_regex(ValueError, 'int32 overflow'):
data.pivot_table(index='ind1', columns='ind2',
df.pivot_table(index='ind1', columns='ind2',
values='count', aggfunc='count')


Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1195,6 +1195,13 @@ def test_unstack_unobserved_keys(self):
recons = result.stack()
tm.assert_frame_equal(recons, df)

@pytest.mark.slow
def test_unstack_number_of_levels_larger_than_int32(self):
    """Unstacking a frame whose result would not fit in int32 must raise."""
    # GH 20601
    # Both index levels carry 2 ** 16 labels, so the unstacked shape would
    # need 2 ** 16 * 2 ** 16 cells, which exceeds 2 ** 31 - 1.
    n_labels = 2 ** 16
    values = np.random.randn(n_labels, 2)
    two_level_index = [np.arange(n_labels), np.arange(n_labels)]
    frame = DataFrame(values, index=two_level_index)
    with tm.assert_raises_regex(ValueError, 'int32 overflow'):
        frame.unstack()

def test_stack_order_with_unsorted_levels(self):
# GH 16323

Expand Down