googleapis
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
Lines changed: 29 additions & 0 deletions
diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py
Lines changed: 18 additions & 1 deletion
diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py
Lines changed: 5 additions & 0 deletions
diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py
Lines changed: 21 additions & 0 deletions
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
Lines changed: 20 additions & 0 deletions
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
Lines changed: 80 additions & 23 deletions
diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py
Lines changed: 8 additions & 0 deletions
@@ -933,6 +933,35 @@ def aggregate_all_and_stack(
  index_labels=self.index.names,
  )
 
+    def aggregate_size(
+        self,
+        by_column_ids: typing.Sequence[str] = (),
+        *,
+        dropna: bool = True,
+    ):
+        """Build a block that computes the size of each group.
+
+        Args:
+            by_column_ids: ids of the grouping columns; they are used as the
+                index columns of the returned block.
+            dropna: forwarded unchanged to ``self.expr.aggregate``.
+
+        Returns:
+            A ``(block, output_col_ids)`` pair, where ``output_col_ids`` is a
+            one-element list holding the generated id of the size column.
+        """
+        size_col_id = guid.generate_guid()
+        size_spec = (ex.NullaryAggregation(agg_ops.SizeOp()), size_col_id)
+        result_expr = self.expr.aggregate([size_spec], by_column_ids, dropna=dropna)
+        # NOTE(review): the literal "size" is passed where
+        # `_get_labels_for_columns` presumably expects column ids - confirm.
+        aggregate_labels = self._get_labels_for_columns(["size"])
+        # A grouping key is either a value column or an index column, so its
+        # label is read from whichever mapping holds it.
+        names: typing.List[Label] = [
+            self.col_id_to_label[key_id]
+            if key_id in self.value_columns
+            else self.col_id_to_index_name[key_id]
+            for key_id in by_column_ids
+        ]
+        size_block = Block(
+            result_expr,
+            index_columns=by_column_ids,
+            column_labels=aggregate_labels,
+            index_labels=names,
+        )
+        return size_block, [size_col_id]
+
  def select_column(self, id: str) -> Block:
  return self.select_columns([id])
 
 
@@ -35,6 +35,8 @@ def compile_aggregate(
  aggregate: ex.Aggregation,
  bindings: typing.Dict[str, ibis_types.Value],
 ) -> ibis_types.Value:
+ if isinstance(aggregate, ex.NullaryAggregation):
+ return compile_nullary_agg(aggregate.op)
  if isinstance(aggregate, ex.UnaryAggregation):
  input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings)
  return compile_unary_agg(
@@ -54,7 +56,9 @@ def compile_analytic(
  window: window_spec.WindowSpec,
  bindings: typing.Dict[str, ibis_types.Value],
 ) -> ibis_types.Value:
- if isinstance(aggregate, ex.UnaryAggregation):
+ if isinstance(aggregate, ex.NullaryAggregation):
+ return compile_nullary_agg(aggregate.op, window)
+ elif isinstance(aggregate, ex.UnaryAggregation):
  input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings)
  return compile_unary_agg(aggregate.op, input, window)
  elif isinstance(aggregate, ex.BinaryAggregation):
@@ -81,6 +85,14 @@ def compile_unary_agg(
  raise ValueError(f"Can't compile unrecognized operation: {op}")
 
 
+@functools.singledispatch
+def compile_nullary_agg(
+    op: agg_ops.WindowOp,
+    window: Optional[window_spec.WindowSpec] = None,
+) -> ibis_types.Value:
+    """Fallback for argument-less aggregations with no registered compiler.
+
+    Implementations are attached per operation type through
+    ``compile_nullary_agg.register``; reaching this body means none matched.
+
+    Raises:
+        ValueError: always, naming the unrecognized operation.
+    """
+    message = f"Can't compile unrecognized operation: {op}"
+    raise ValueError(message)
+
+
 def numeric_op(operation):
  @functools.wraps(operation)
  def constrained_op(op, column: ibis_types.Column, window=None):
@@ -101,6 +113,11 @@ def constrained_op(op, column: ibis_types.Column, window=None):
 ### Specific Op implementations Below
 
 
+@compile_nullary_agg.register
+def _(op: agg_ops.SizeOp, window=None) -> ibis_types.NumericValue:
+    # Size is expressed as the vendored `count` applied to the constant 1,
+    # then wrapped in the window when one is supplied.
+    row_count = vendored_ibis_ops.count(1)
+    return _apply_window_if_present(row_count, window)
+
+
 @compile_unary_agg.register
 @numeric_op
 def _(
 
@@ -40,6 +40,11 @@ class Aggregation(abc.ABC):
  op: agg_ops.WindowOp = dataclasses.field()
 
 
+@dataclasses.dataclass(frozen=True)
+class NullaryAggregation(Aggregation):
+ # Aggregation whose only declared field is its operator; the `op`
+ # annotation is narrowed from WindowOp to NullaryWindowOp.
+ op: agg_ops.NullaryWindowOp = dataclasses.field()
+
+
 @dataclasses.dataclass(frozen=True)
 class UnaryAggregation(Aggregation):
  op: agg_ops.UnaryWindowOp = dataclasses.field()
 
@@ -102,6 +102,20 @@ def __getitem__(
  dropna=self._dropna,
  )
 
+    def size(self) -> typing.Union[df.DataFrame, series.Series]:
+        """Compute the size of each group.
+
+        Returns:
+            When ``self._as_index`` is true, the "size" column as an unnamed
+            Series. Otherwise the DataFrame carrying the "size" column, after
+            being passed through ``self._convert_index``.
+        """
+        agg_block, _ = self._block.aggregate_size(
+            by_column_ids=self._by_col_ids,
+            dropna=self._dropna,
+        )
+        agg_block = agg_block.with_column_labels(pd.Index(["size"]))
+        dataframe = df.DataFrame(agg_block)
+
+        if not self._as_index:
+            return self._convert_index(dataframe)
+
+        # Deliberately not called `series`: that name is the imported module
+        # used in this method's return annotation, and a local of the same
+        # name would shadow it for the whole function body.
+        size_series = dataframe["size"]
+        return size_series.rename(None)
+
  def sum(self, numeric_only: bool = False, *args) -> df.DataFrame:
  if not numeric_only:
  self._raise_on_non_numeric("sum")
@@ -475,6 +489,13 @@ def std(self, *args, **kwargs) -> series.Series:
  def var(self, *args, **kwargs) -> series.Series:
  return self._aggregate(agg_ops.var_op)
 
+    def size(self) -> series.Series:
+        """Compute the size of each group as a Series named ``self._value_name``."""
+        # aggregate_size also returns the generated output column ids, which
+        # are not needed here.
+        size_block, _unused_col_ids = self._block.aggregate_size(
+            by_column_ids=self._by_col_ids, dropna=self._dropna
+        )
+        return series.Series(size_block, name=self._value_name)
+
  def skew(self, *args, **kwargs) -> series.Series:
  block = block_ops.skew(self._block, [self._value_column], self._by_col_ids)
  return series.Series(block)
 
@@ -33,6 +33,13 @@ def handles_ties(self):
  return False
 
 
+@dataclasses.dataclass(frozen=True)
+class NullaryWindowOp(WindowOp):
+ # Window operation whose `arguments` property reports zero.
+ @property
+ def arguments(self) -> int:
+ return 0
+
+
 @dataclasses.dataclass(frozen=True)
 class UnaryWindowOp(WindowOp):
  @property
@@ -55,6 +62,13 @@ def arguments(self) -> int:
  ...
 
 
+@dataclasses.dataclass(frozen=True)
+class NullaryAggregateOp(AggregateOp, NullaryWindowOp):
+ # NOTE(review): AggregateOp comes before NullaryWindowOp in the bases, so
+ # `arguments` is presumably restated here to keep the zero value from being
+ # masked by a declaration on AggregateOp - confirm against AggregateOp.
+ @property
+ def arguments(self) -> int:
+ return 0
+
+
 @dataclasses.dataclass(frozen=True)
 class UnaryAggregateOp(AggregateOp, UnaryWindowOp):
  @property
@@ -69,6 +83,11 @@ def arguments(self) -> int:
  return 2
 
 
+@dataclasses.dataclass(frozen=True)
+class SizeOp(NullaryAggregateOp):
+ # Argument-less aggregate operation identified by the name "size".
+ name: ClassVar[str] = "size"
+
+
 @dataclasses.dataclass(frozen=True)
 class SumOp(UnaryAggregateOp):
  name: ClassVar[str] = "sum"
@@ -270,6 +289,7 @@ class CovOp(BinaryAggregateOp):
  name: ClassVar[str] = "cov"
 
 
+size_op = SizeOp()
 sum_op = SumOp()
 mean_op = MeanOp()
 median_op = MedianOp()
 
@@ -19,6 +19,10 @@
 from tests.system.utils import assert_pandas_df_equal
 
 
+# =================
+# DataFrame.groupby
+# =================
+
 @pytest.mark.parametrize(
  ("operator"),
  [
@@ -250,21 +254,26 @@ def test_dataframe_groupby_analytic(
  pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
 
 
-def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index):
- bf_result = scalars_df_index.groupby("bool_col")["int64_too"].skew().to_pandas()
- pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].skew()
+def test_dataframe_groupby_size_as_index_false(
+ scalars_df_index, scalars_pandas_df_index
+):
+ # groupby(..., as_index=False).size() is compared with pandas as a frame;
+ # dtype and index-type differences are ignored.
+ bf_result = scalars_df_index.groupby("string_col", as_index=False).size()
+ bf_result_computed = bf_result.to_pandas()
+ pd_result = scalars_pandas_df_index.groupby("string_col", as_index=False).size()
 
- pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+ pd.testing.assert_frame_equal(
+ pd_result, bf_result_computed, check_dtype=False, check_index_type=False
+ )
 
 
-def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
- bf_result = scalars_df_index.groupby("bool_col")["int64_too"].kurt().to_pandas()
- # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139
- pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply(
-  pd.Series.kurt
- )
+def test_dataframe_groupby_size_as_index_true(
+ scalars_df_index, scalars_pandas_df_index
+):
+ # groupby(..., as_index=True).size() is compared with pandas as a series;
+ # dtype differences are ignored.
+ bf_result = scalars_df_index.groupby("string_col", as_index=True).size()
+ pd_result = scalars_pandas_df_index.groupby("string_col", as_index=True).size()
+ bf_result_computed = bf_result.to_pandas()
 
- pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+ pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False)
 
 
 def test_dataframe_groupby_skew(scalars_df_index, scalars_pandas_df_index):
@@ -337,6 +346,26 @@ def test_dataframe_groupby_getitem_list(
  pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
 
 
+def test_dataframe_groupby_nonnumeric_with_mean():
+    """Group by two string key columns and check mean() against pandas."""
+    columns = {
+        "key1": ["a", "a", "a", "b"],
+        "key2": ["a", "a", "c", "c"],
+        "key3": [1, 2, 3, 4],
+        "key4": [1.6, 2, 3, 4],
+    }
+    pandas_df = pd.DataFrame(columns)
+
+    expected = pandas_df.groupby(["key1", "key2"]).mean()
+    bigframes_grouped = bpd.DataFrame(pandas_df).groupby(["key1", "key2"])
+    actual = bigframes_grouped.mean().to_pandas()
+
+    # Index type and dtypes are allowed to differ between the two libraries.
+    pd.testing.assert_frame_equal(
+        expected, actual, check_index_type=False, check_dtype=False
+    )
+
+# ==============
+# Series.groupby
+# ==============
+
 def test_series_groupby_agg_string(scalars_df_index, scalars_pandas_df_index):
  bf_result = (
  scalars_df_index["int64_col"]
@@ -373,18 +402,46 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
  )
 
 
-def test_dataframe_groupby_nonnumeric_with_mean():
- df = pd.DataFrame(
- {
- "key1": ["a", "a", "a", "b"],
- "key2": ["a", "a", "c", "c"],
- "key3": [1, 2, 3, 4],
- "key4": [1.6, 2, 3, 4],
- }
+def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
+ # Series.groupby(series).kurt() is compared with a per-group pd.Series.kurt
+ # applied on the pandas side; dtype differences are ignored.
+ bf_result = (
+ scalars_df_index["int64_too"]
+ .groupby(scalars_df_index["bool_col"])
+ .kurt()
+ .to_pandas()
+ )
+ # Pandas doesn't have groupby.kurt yet: https://github.com/pandas-dev/pandas/issues/40139
+ pd_result = scalars_pandas_df_index.groupby("bool_col")["int64_too"].apply(
+ pd.Series.kurt
  )
- pd_result = df.groupby(["key1", "key2"]).mean()
- bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas()
 
- pd.testing.assert_frame_equal(
- pd_result, bf_result, check_index_type=False, check_dtype=False
+ pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+
+
+def test_series_groupby_size(scalars_df_index, scalars_pandas_df_index):
+ # Series.groupby(series).size() is compared with the same call on pandas;
+ # dtype differences are ignored.
+ bf_result = (
+ scalars_df_index["int64_too"].groupby(scalars_df_index["bool_col"]).size()
  )
+ pd_result = (
+ scalars_pandas_df_index["int64_too"]
+ .groupby(scalars_pandas_df_index["bool_col"])
+ .size()
+ )
+ bf_result_computed = bf_result.to_pandas()
+
+ pd.testing.assert_series_equal(pd_result, bf_result_computed, check_dtype=False)
+
+
+def test_series_groupby_skew(scalars_df_index, scalars_pandas_df_index):
+    """Check Series.groupby(series).skew() against the same call on pandas."""
+    bf_groups = scalars_df_index["int64_too"].groupby(scalars_df_index["bool_col"])
+    bf_result = bf_groups.skew().to_pandas()
+
+    pd_groups = scalars_pandas_df_index["int64_too"].groupby(
+        scalars_pandas_df_index["bool_col"]
+    )
+    pd_result = pd_groups.skew()
+
+    # dtypes are allowed to differ between the two libraries.
+    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
@@ -2,10 +2,17 @@
 
 from __future__ import annotations
 
+import ibis
 import ibis.expr.operations as ops
 import ibis.expr.rules as rlz
 
 
+@ibis.udf.agg.builtin
+def count(value: int) -> int:
+ """Count of a scalar."""
+ # NOTE(review): presumably a placeholder body - as a builtin aggregate UDF
+ # the call is expected to be handled by the backend, which would explain
+ # the no-cover pragma. Confirm against the ibis UDF documentation.
+ return 0 # pragma: NO COVER
+
+
 class FirstNonNullValue(ops.Analytic):
  """Retrieve the first element."""
 
@@ -21,6 +28,7 @@ class LastNonNullValue(ops.Analytic):
 
 
 __all__ = [
+ "count",
  "FirstNonNullValue",
  "LastNonNullValue",
 ]