
Commit eedd123

Merge branch 'main' into fix_hash

2 parents: 6d25225 + 3acc494

File tree: 6 files changed, +196 -60 lines


README.rst (1 addition, 0 deletions)

@@ -25,6 +25,7 @@ Documentation
 * `BigQuery DataFrames source code (GitHub) <https://github.com/googleapis/python-bigquery-dataframes>`_
 * `BigQuery DataFrames sample notebooks <https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks>`_
 * `BigQuery DataFrames API reference <https://cloud.google.com/python/docs/reference/bigframes/latest/summary_overview>`_
+* `BigQuery DataFrames supported pandas APIs <https://cloud.google.com/python/docs/reference/bigframes/latest/supported_pandas_apis>`_


 Getting started with BigQuery DataFrames

bigframes/dtypes.py (10 additions, 6 deletions)

@@ -658,10 +658,14 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
     return None


-def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
-    """Get the supertype of the two types."""
-    if dtype1 == dtype2:
-        return dtype1
+def lcd_type(*dtypes: Dtype) -> Dtype:
+    if len(dtypes) < 1:
+        raise ValueError("at least one dtype should be provided")
+    if len(dtypes) == 1:
+        return dtypes[0]
+    unique_dtypes = set(dtypes)
+    if len(unique_dtypes) == 1:
+        return unique_dtypes.pop()
     # Implicit conversion currently only supported for numeric types
     hierarchy: list[Dtype] = [
         pd.BooleanDtype(),
@@ -670,9 +674,9 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
         pd.ArrowDtype(pa.decimal256(76, 38)),
         pd.Float64Dtype(),
     ]
-    if (dtype1 not in hierarchy) or (dtype2 not in hierarchy):
+    if any([dtype not in hierarchy for dtype in dtypes]):
         return None
-    lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2))
+    lcd_index = max([hierarchy.index(dtype) for dtype in dtypes])
     return hierarchy[lcd_index]
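
For orientation, a minimal sketch of what the variadic `lcd_type` now accepts (assuming the import path `bigframes.dtypes`, and that `pd.Int64Dtype()` sits between Boolean and Float64 in the hierarchy, as it does in the full file):

```python
import pandas as pd

from bigframes.dtypes import lcd_type

# One dtype, or all-identical dtypes, short-circuit before the hierarchy lookup.
assert lcd_type(pd.Int64Dtype()) == pd.Int64Dtype()
assert lcd_type(pd.Int64Dtype(), pd.Int64Dtype()) == pd.Int64Dtype()

# Mixed numeric dtypes resolve to the widest member of the hierarchy.
assert lcd_type(pd.BooleanDtype(), pd.Int64Dtype()) == pd.Int64Dtype()
assert lcd_type(pd.BooleanDtype(), pd.Int64Dtype(), pd.Float64Dtype()) == pd.Float64Dtype()

# Anything outside the numeric hierarchy has no common supertype.
assert lcd_type(pd.Int64Dtype(), pd.StringDtype(storage="pyarrow")) is None
```

This is what lets the integer-only aggregations tested below keep an `Int64` result instead of widening everything to `Float64`.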
677681

678682

bigframes/session/__init__.py (17 additions, 35 deletions)

@@ -232,7 +232,9 @@ def __init__(
         # Now that we're starting the session, don't allow the options to be
         # changed.
         context._session_started = True
-        self._df_snapshot: Dict[bigquery.TableReference, datetime.datetime] = {}
+        self._df_snapshot: Dict[
+            bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]
+        ] = {}

     @property
     def bqclient(self):
@@ -699,14 +701,25 @@ def _get_snapshot_sql_and_primary_key(
         column(s), then return those too so that ordering generation can be
         avoided.
         """
-        # If there are primary keys defined, the query engine assumes these
-        # columns are unique, even if the constraint is not enforced. We make
-        # the same assumption and use these columns as the total ordering keys.
+        (
+            snapshot_timestamp,
+            table,
+        ) = bigframes_io.get_snapshot_datetime_and_table_metadata(
+            self.bqclient,
+            table_ref=table_ref,
+            api_name=api_name,
+            cache=self._df_snapshot,
+            use_cache=use_cache,
+        )
+
         if table.location.casefold() != self._location.casefold():
             raise ValueError(
                 f"Current session is in {self._location} but dataset '{table.project}.{table.dataset_id}' is located in {table.location}"
             )

+        # If there are primary keys defined, the query engine assumes these
+        # columns are unique, even if the constraint is not enforced. We make
+        # the same assumption and use these columns as the total ordering keys.
         primary_keys = None
         if (
             (table_constraints := getattr(table, "table_constraints", None)) is not None
@@ -717,37 +730,6 @@ def _get_snapshot_sql_and_primary_key(
         ):
             primary_keys = columns

-        job_config = bigquery.QueryJobConfig()
-        job_config.labels["bigframes-api"] = api_name
-        if use_cache and table.reference in self._df_snapshot.keys():
-            snapshot_timestamp = self._df_snapshot[table.reference]
-
-            # Cache hit could be unexpected. See internal issue 329545805.
-            # Raise a warning with more information about how to avoid the
-            # problems with the cache.
-            warnings.warn(
-                f"Reading cached table from {snapshot_timestamp} to avoid "
-                "incompatibilities with previous reads of this table. To read "
-                "the latest version, set `use_cache=False` or close the "
-                "current session with Session.close() or "
-                "bigframes.pandas.close_session().",
-                # There are many layers before we get to (possibly) the user's code:
-                # pandas.read_gbq_table
-                # -> with_default_session
-                # -> Session.read_gbq_table
-                # -> _read_gbq_table
-                # -> _get_snapshot_sql_and_primary_key
-                stacklevel=6,
-            )
-        else:
-            snapshot_timestamp = list(
-                self.bqclient.query(
-                    "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
-                    job_config=job_config,
-                ).result()
-            )[0][0]
-            self._df_snapshot[table.reference] = snapshot_timestamp
-
         try:
             table_expression = self.ibis_client.sql(
                 bigframes_io.create_snapshot_sql(table.reference, snapshot_timestamp)
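
The user-visible behavior is unchanged by this refactor; a hedged sketch of the caching contract it preserves (the table name is hypothetical, and `use_cache` is the flag threaded through the call chain above):

```python
import bigframes.pandas as bpd

# The first read pins a snapshot timestamp (and now also the table
# metadata) in the session's _df_snapshot cache.
df = bpd.read_gbq_table("my-project.my_dataset.my_table")

# A repeat read within the session reuses the pinned snapshot and emits
# the warning above; use_cache=False requests a fresh timestamp instead.
df_latest = bpd.read_gbq_table("my-project.my_dataset.my_table", use_cache=False)
```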

bigframes/session/_io/bigquery.py (54 additions, 0 deletions)

@@ -23,6 +23,7 @@
 import types
 from typing import Dict, Iterable, Optional, Sequence, Tuple, Union
 import uuid
+import warnings

 import google.api_core.exceptions
 import google.cloud.bigquery as bigquery
@@ -121,6 +122,59 @@ def table_ref_to_sql(table: bigquery.TableReference) -> str:
     return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`"


+def get_snapshot_datetime_and_table_metadata(
+    bqclient: bigquery.Client,
+    table_ref: bigquery.TableReference,
+    *,
+    api_name: str,
+    cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]],
+    use_cache: bool = True,
+) -> Tuple[datetime.datetime, bigquery.Table]:
+    cached_table = cache.get(table_ref)
+    if use_cache and cached_table is not None:
+        snapshot_timestamp, _ = cached_table
+
+        # Cache hit could be unexpected. See internal issue 329545805.
+        # Raise a warning with more information about how to avoid the
+        # problems with the cache.
+        warnings.warn(
+            f"Reading cached table from {snapshot_timestamp} to avoid "
+            "incompatibilities with previous reads of this table. To read "
+            "the latest version, set `use_cache=False` or close the "
+            "current session with Session.close() or "
+            "bigframes.pandas.close_session().",
+            # There are many layers before we get to (possibly) the user's code:
+            # pandas.read_gbq_table
+            # -> with_default_session
+            # -> Session.read_gbq_table
+            # -> _read_gbq_table
+            # -> _get_snapshot_sql_and_primary_key
+            # -> get_snapshot_datetime_and_table_metadata
+            stacklevel=7,
+        )
+        return cached_table
+
+    # TODO(swast): It's possible that the table metadata is changed between now
+    # and when we run the CURRENT_TIMESTAMP() query to see when we can time
+    # travel to. Find a way to fetch the table metadata and BQ's current time
+    # atomically.
+    table = bqclient.get_table(table_ref)
+
+    # TODO(b/336521938): Refactor to make sure we set the "bigframes-api"
+    # label wherever we execute a query.
+    job_config = bigquery.QueryJobConfig()
+    job_config.labels["bigframes-api"] = api_name
+    snapshot_timestamp = list(
+        bqclient.query(
+            "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
+            job_config=job_config,
+        ).result()
+    )[0][0]
+    cached_table = (snapshot_timestamp, table)
+    cache[table_ref] = cached_table
+    return cached_table
+
+
 def create_snapshot_sql(
     table_ref: bigquery.TableReference, current_timestamp: datetime.datetime
 ) -> str:
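
A hedged sketch of driving the new helper directly (project, dataset, and table names are hypothetical; assumes ambient Google Cloud credentials). The returned timestamp feeds the time-travel SQL that `create_snapshot_sql` builds for the session:

```python
import datetime
from typing import Dict, Tuple

import google.cloud.bigquery as bigquery

import bigframes.session._io.bigquery as bigframes_io

bqclient = bigquery.Client()
table_ref = bigquery.TableReference.from_string("my-project.my_dataset.my_table")

# The session normally owns this cache; an empty one forces the slow path.
cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]] = {}

snapshot_timestamp, table = bigframes_io.get_snapshot_datetime_and_table_metadata(
    bqclient,
    table_ref,
    api_name="read_gbq_table",
    cache=cache,
    use_cache=True,
)

# A second call with the same cache takes the cache-hit branch and warns.
sql = bigframes_io.create_snapshot_sql(table_ref, snapshot_timestamp)
```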

tests/system/small/test_dataframe.py (109 additions, 17 deletions)

@@ -2390,12 +2390,27 @@ def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods
 def test_dataframe_agg_single_string(scalars_dfs):
     numeric_cols = ["int64_col", "int64_too", "float64_col"]
     scalars_df, scalars_pandas_df = scalars_dfs
+
     bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
     pd_result = scalars_pandas_df[numeric_cols].agg("sum")

-    # Pandas may produce narrower numeric types, but bigframes always produces Float64
-    pd_result = pd_result.astype("Float64")
-    pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
+    assert bf_result.dtype == "Float64"
+    pd.testing.assert_series_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_dataframe_agg_int_single_string(scalars_dfs):
+    numeric_cols = ["int64_col", "int64_too", "bool_col"]
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
+    pd_result = scalars_pandas_df[numeric_cols].agg("sum")
+
+    assert bf_result.dtype == "Int64"
+    pd.testing.assert_series_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )


 def test_dataframe_agg_multi_string(scalars_dfs):
@@ -2431,6 +2446,27 @@ def test_dataframe_agg_multi_string(scalars_dfs):
     ).all()


+def test_dataframe_agg_int_multi_string(scalars_dfs):
+    numeric_cols = ["int64_col", "int64_too", "bool_col"]
+    aggregations = [
+        "sum",
+        "nunique",
+        "count",
+    ]
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas()
+    pd_result = scalars_pandas_df[numeric_cols].agg(aggregations)
+
+    for dtype in bf_result.dtypes:
+        assert dtype == "Int64"
+
+    # Pandas may produce narrower numeric types
+    # Pandas has object index type
+    pd.testing.assert_frame_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
 @skip_legacy_pandas
 def test_df_describe(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
@@ -2982,6 +3018,58 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
     pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99


+@pytest.mark.parametrize(
+    ("col", "op"),
+    [
+        # Int aggregates
+        pytest.param("int64_col", lambda x: x.sum(), id="int-sum"),
+        pytest.param("int64_col", lambda x: x.min(), id="int-min"),
+        pytest.param("int64_col", lambda x: x.max(), id="int-max"),
+        pytest.param("int64_col", lambda x: x.count(), id="int-count"),
+        pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"),
+        # Float aggregates
+        pytest.param("float64_col", lambda x: x.count(), id="float-count"),
+        pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"),
+        # Bool aggregates
+        pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"),
+        pytest.param("bool_col", lambda x: x.count(), id="bool-count"),
+        pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"),
+        # String aggregates
+        pytest.param("string_col", lambda x: x.count(), id="string-count"),
+        pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"),
+    ],
+)
+def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op):
+    bf_result = op(scalars_df_index[[col]]).to_pandas()
+    pd_result = op(scalars_pandas_df_index[[col]])
+
+    # Check dtype separately
+    assert bf_result.dtype == "Int64"
+
+    # Pandas may produce narrower numeric types
+    # Pandas has object index type
+    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+
+
+@pytest.mark.parametrize(
+    ("col", "op"),
+    [
+        pytest.param("bool_col", lambda x: x.min(), id="bool-min"),
+        pytest.param("bool_col", lambda x: x.max(), id="bool-max"),
+    ],
+)
+def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op):
+    bf_result = op(scalars_df_index[[col]]).to_pandas()
+    pd_result = op(scalars_pandas_df_index[[col]])
+
+    # Check dtype separately
+    assert bf_result.dtype == "boolean"
+
+    # Pandas may produce narrower numeric types
+    # Pandas has object index type
+    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+
+
 @pytest.mark.parametrize(
     ("ordered"),
     [
@@ -2990,34 +3078,38 @@ def test_loc_setitem_bool_series_scalar_error(scalars_dfs):
     ],
 )
 @pytest.mark.parametrize(
-    ("op"),
+    ("op", "bf_dtype"),
     [
-        (lambda x: x.sum(numeric_only=True)),
-        (lambda x: x.mean(numeric_only=True)),
-        (lambda x: x.min(numeric_only=True)),
-        (lambda x: x.max(numeric_only=True)),
-        (lambda x: x.std(numeric_only=True)),
-        (lambda x: x.var(numeric_only=True)),
-        (lambda x: x.count(numeric_only=False)),
-        (lambda x: x.nunique()),
+        (lambda x: x.sum(numeric_only=True), "Float64"),
+        (lambda x: x.mean(numeric_only=True), "Float64"),
+        (lambda x: x.min(numeric_only=True), "Float64"),
+        (lambda x: x.max(numeric_only=True), "Float64"),
+        (lambda x: x.std(numeric_only=True), "Float64"),
+        (lambda x: x.var(numeric_only=True), "Float64"),
+        (lambda x: x.count(numeric_only=False), "Int64"),
+        (lambda x: x.nunique(), "Int64"),
     ],
     ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"],
 )
-def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op, ordered):
+def test_dataframe_aggregates(
+    scalars_df_index, scalars_pandas_df_index, op, bf_dtype, ordered
+):
     col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"]
     bf_series = op(scalars_df_index[col_names])
-    pd_series = op(scalars_pandas_df_index[col_names])
     bf_result = bf_series.to_pandas(ordered=ordered)
+    pd_result = op(scalars_pandas_df_index[col_names])
+
+    # Check dtype separately
+    assert bf_result.dtype == bf_dtype

     # Pandas may produce narrower numeric types, but bigframes always produces Float64
     # Pandas has object index type
-    pd_series.index = pd_series.index.astype(pd.StringDtype(storage="pyarrow"))
     assert_series_equal(
-        pd_series,
+        pd_result,
         bf_result,
+        check_dtype=False,
         check_index_type=False,
         ignore_order=not ordered,
-        check_dtype=False,
     )
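Taken together, these tests pin down the dtype contract the `lcd_type` change enables: integer and boolean inputs now aggregate to `Int64` where they previously widened to `Float64`. A hedged sketch of the user-visible effect (column names follow the `scalars` test fixtures; the table itself is hypothetical):

```python
import bigframes.pandas as bpd

df = bpd.read_gbq_table("my-project.my_dataset.scalars")  # hypothetical table

# Int64 and boolean columns now aggregate to Int64...
int_sums = df[["int64_col", "int64_too", "bool_col"]].agg("sum")
assert int_sums.to_pandas().dtype == "Int64"

# ...while mixing in a float column still widens the result to Float64.
mixed_sums = df[["int64_col", "float64_col"]].agg("sum")
assert mixed_sums.to_pandas().dtype == "Float64"
```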

tests/unit/session/test_session.py (5 additions, 2 deletions)

@@ -42,8 +42,11 @@ def test_read_gbq_cached_table():
         google.cloud.bigquery.DatasetReference("my-project", "my_dataset"),
         "my_table",
     )
-    session._df_snapshot[table_ref] = datetime.datetime(
-        1999, 1, 2, 3, 4, 5, 678901, tzinfo=datetime.timezone.utc
+    table = google.cloud.bigquery.Table(table_ref)
+    table._properties["location"] = session._location
+    session._df_snapshot[table_ref] = (
+        datetime.datetime(1999, 1, 2, 3, 4, 5, 678901, tzinfo=datetime.timezone.utc),
+        table,
     )

     def get_table_mock(table_ref):
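
Seeding `_df_snapshot` with a `(timestamp, Table)` tuple lets the unit test exercise the cache-hit branch without touching BigQuery. A hedged sketch of additionally asserting the warning that branch emits (the fixture is hypothetical; `warnings.warn` defaults to `UserWarning`):

```python
import pytest

def test_cached_read_warns(session_with_seeded_cache):  # hypothetical fixture
    # The cache-hit branch of get_snapshot_datetime_and_table_metadata
    # warns that a pinned snapshot is being reused.
    with pytest.warns(UserWarning, match="Reading cached table"):
        session_with_seeded_cache.read_gbq_table("my-project.my_dataset.my_table")
```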
