ENH catch the int32 overflow error earlier and in two separate places: in pivot_table and unstack
pandas-dev · anhqle · Apr 16, 2018 · Apr 16, 2018 · Apr 16, 2018 · Apr 22, 2018
commit 01a79439e4ede04d4e03aeb812e7c72035a17e1d
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -31,6 +31,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
  index = _convert_by(index)
  columns = _convert_by(columns)
 
+ num_rows = data.reindex(index, axis='columns').shape[0]
+ num_columns = data.reindex(columns, axis='columns').shape[0]
+ if num_rows * num_columns > (2 ** 31 - 1):
+ raise ValueError('Pivot table is too big, causing int32 overflow')
+
  if isinstance(aggfunc, list):
  pieces = []
  keys = []

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -126,6 +126,11 @@ def __init__(self, values, index, level=-1, value_columns=None,
  self.removed_level = self.new_index_levels.pop(self.level)
  self.removed_level_full = index.levels[self.level]
 
+ num_rows = np.max([index_level.size for index_level in self.new_index_levels])
+ num_columns = self.removed_level.size
+ if num_rows * num_columns > (2 ** 31 - 1):
+ raise ValueError('Unstacked data frame is too big, causing int32 overflow')
+
  self._make_sorted_values_labels()
  self._make_selectors()
 
@@ -161,8 +166,6 @@ def _make_selectors(self):
  self.full_shape = ngroups, stride
 
  selector = self.sorted_labels[-1] + stride * comp_index + self.lift
- if np.prod(self.full_shape) > (2 ** 31 - 1):
- raise ValueError('Pivot table is too big, causing int32 overflow')
  mask = np.zeros(np.prod(self.full_shape), dtype=bool)
  mask.put(selector, True)
 

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -1278,11 +1278,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
  @pytest.mark.slow
  def test_pivot_number_of_levels_larger_than_int32(self):
  # GH 20601
- data = DataFrame({'ind1': list(range(1337600)) * 2,
- 'ind2': list(range(3040)) * 2 * 440,
- 'count': [1] * 2 * 1337600})
+ df = DataFrame({'ind1': np.arange(2 ** 16),
+ 'ind2': np.arange(2 ** 16),
+ 'count': np.arange(2 ** 16)})
  with tm.assert_raises_regex(ValueError, 'int32 overflow'):
- data.pivot_table(index='ind1', columns='ind2',
+ df.pivot_table(index='ind1', columns='ind2',
  values='count', aggfunc='count')
 
 

diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -1195,6 +1195,13 @@ def test_unstack_unobserved_keys(self):
  recons = result.stack()
  tm.assert_frame_equal(recons, df)
 
+ @pytest.mark.slow
+ def test_unstack_number_of_levels_larger_than_int32(self):
+ # GH 20601
+ df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)])
+ with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+ df.unstack()
+
  def test_stack_order_with_unsorted_levels(self):
  # GH 16323