3 changes: 2 additions & 1 deletion bigframes/core/indexers.py
@@ -21,6 +21,7 @@
import pandas as pd

import bigframes.constants as constants
import bigframes.core.blocks
import bigframes.core.guid as guid
import bigframes.core.indexes as indexes
import bigframes.core.scalar
@@ -214,7 +215,7 @@ def __getitem__(self, key: tuple) -> bigframes.core.scalar.Scalar:
            raise ValueError(error_message)
        if len(key) != 2:
            raise TypeError(error_message)
-       block = self._dataframe._block
+       block: bigframes.core.blocks.Block = self._dataframe._block
        column_block = block.select_columns([block.value_columns[key[1]]])
        column = bigframes.series.Series(column_block)
        return column.iloc[key[0]]
110 changes: 95 additions & 15 deletions bigframes/session.py
@@ -498,6 +498,8 @@ def read_gbq_query(

        See also: :meth:`Session.read_gbq`.
        """
+       # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
+       # these docstrings are inline.
        return self._read_gbq_query(
            query=query,
            index_col=index_col,
@@ -515,8 +517,6 @@ def _read_gbq_query(
        max_results: Optional[int] = None,
        api_name: str,
    ) -> dataframe.DataFrame:
-       # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
-       # these docstrings are inline.
        if isinstance(index_col, str):
            index_cols = [index_col]
        else:
@@ -561,6 +561,8 @@ def read_gbq_table(

        See also: :meth:`Session.read_gbq`.
        """
+       # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
+       # these docstrings are inline.
        return self._read_gbq_table(
            query=query,
            index_col=index_col,
@@ -569,6 +571,62 @@ def read_gbq_table(
api_name="read_gbq_table",
)

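A usage sketch, not part of the diff: how the wrapper above is typically called. This assumes `bigframes.Session` can be constructed with default credentials and project; the table path is a placeholder.

import bigframes

session = bigframes.Session()
# A table with (unenforced) primary keys takes the new fast path added below:
# the key columns provide a total ordering, so no ordering job is generated.
df = session.read_gbq_table("my-project.my_dataset.table_with_primary_keys")
print(df.head(5).to_pandas())
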
    def _read_gbq_table_to_ibis_with_total_ordering(
        self,
        table_ref: bigquery.table.TableReference,
        *,
        api_name: str,
    ) -> Tuple[ibis_types.Table, Optional[Sequence[str]]]:
        """Create a read-only Ibis table expression representing a table.

        If we can get a total ordering from the table, such as via primary key
        column(s), then return those too so that ordering generation can be
        avoided.
        """
        if table_ref.dataset_id.upper() == "_SESSION":
            # _SESSION tables aren't supported by the tables.get REST API.
            return (
                self.ibis_client.sql(
                    f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`"
                ),
                None,
            )

        table_expression = self.ibis_client.table(
            table_ref.table_id,
            database=f"{table_ref.project}.{table_ref.dataset_id}",
        )

        # If there are primary keys defined, the query engine assumes these
        # columns are unique, even if the constraint is not enforced. We make
        # the same assumption and use these columns as the total ordering keys.
        table = self.bqclient.get_table(table_ref)

        # TODO(b/305264153): Use public properties to fetch primary keys once
        # added to google-cloud-bigquery.
        primary_keys = (
            table._properties.get("tableConstraints", {})
            .get("primaryKey", {})
            .get("columns")
        )

        if not primary_keys:
            return table_expression, None
        else:
            # Read from a snapshot so that we don't have to copy the table
            # data to create a total ordering.
            job_config = bigquery.QueryJobConfig()
            job_config.labels["bigframes-api"] = api_name
            current_timestamp = list(
                self.bqclient.query(
                    "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
                    job_config=job_config,
                ).result()
            )[0][0]
            table_expression = self.ibis_client.sql(
                bigframes_io.create_snapshot_sql(table_ref, current_timestamp)
            )
            return table_expression, primary_keys

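The `create_snapshot_sql` helper is not shown in this diff. As a sketch, the time-travel SQL it is expected to emit looks roughly like the following; the test at the bottom of this change asserts only on the `FOR SYSTEM_TIME AS OF TIMESTAMP` clause, so the exact quoting and timestamp formatting here are assumptions.

import datetime

import google.cloud.bigquery as bigquery


def create_snapshot_sql_sketch(
    table_ref: bigquery.TableReference,
    current_timestamp: datetime.datetime,
) -> str:
    """Builds a query pinned to the table's state at `current_timestamp`."""
    return (
        f"SELECT * FROM `{table_ref.project}`.`{table_ref.dataset_id}`"
        f".`{table_ref.table_id}` FOR SYSTEM_TIME AS OF "
        f"TIMESTAMP('{current_timestamp.isoformat()}')"
    )
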
    def _read_gbq_table(
        self,
        query: str,
@@ -581,24 +639,19 @@ def _read_gbq_table(
        if max_results and max_results <= 0:
            raise ValueError("`max_results` should be a positive number.")

-       # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so
-       # these docstrings are inline.
        # TODO(swast): Can we re-use the temp table from other reads in the
        # session, if the original table wasn't modified?
        table_ref = bigquery.table.TableReference.from_string(
            query, default_project=self.bqclient.project
        )

-       if table_ref.dataset_id.upper() == "_SESSION":
-           # _SESSION tables aren't supported by the tables.get REST API.
-           table_expression = self.ibis_client.sql(
-               f"SELECT * FROM `_SESSION`.`{table_ref.table_id}`"
-           )
-       else:
-           table_expression = self.ibis_client.table(
-               table_ref.table_id,
-               database=f"{table_ref.project}.{table_ref.dataset_id}",
-           )
+       (
+           table_expression,
+           total_ordering_cols,
+       ) = self._read_gbq_table_to_ibis_with_total_ordering(
+           table_ref,
+           api_name=api_name,
+       )

        for key in col_order:
            if key not in table_expression.columns:
@@ -624,7 +677,34 @@
        ordering = None
        is_total_ordering = False

-       if len(index_cols) != 0:
+       if total_ordering_cols is not None:
+           # Note: currently, a table has a total ordering only when the
+           # primary key(s) are set on a table. The query engine assumes such
+           # columns are unique, even if not enforced.
+           is_total_ordering = True
+           ordering = core.ExpressionOrdering(
+               ordering_value_columns=[
+                   core.OrderingColumnReference(column_id)
+                   for column_id in total_ordering_cols
+               ],
+               total_ordering_columns=frozenset(total_ordering_cols),
+           )
+
+           if len(index_cols) != 0:
+               index_labels = typing.cast(List[Optional[str]], index_cols)
+           else:
+               # Use the total_ordering_cols to project offsets to use as the
+               # default index.
+               table_expression = table_expression.order_by(total_ordering_cols)
+               default_index_id = guid.generate_guid("bigframes_index_")
+               default_index_col = (
+                   ibis.row_number().cast(ibis_dtypes.int64).name(default_index_id)
+               )
+               table_expression = table_expression.mutate(
+                   **{default_index_id: default_index_col}
+               )
+               index_cols = [default_index_id]
+               index_labels = [None]
+       elif len(index_cols) != 0:
            index_labels = typing.cast(List[Optional[str]], index_cols)
            distinct_table = table_expression.select(*index_cols).distinct()
            is_unique_sql = f"""WITH full_table AS (
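For illustration, a standalone sketch of the default-index step above, using the ibis API directly. The connection, project, dataset, and column names are hypothetical, and `ibis.bigquery.connect` is assumed to be available in the pinned ibis version.

import ibis
import ibis.expr.datatypes as ibis_dtypes

con = ibis.bigquery.connect(project_id="my-project")  # hypothetical project
table = con.table("usa_names_grouped", database="my-project.my_dataset")

# Order by the primary-key columns so that row_number() is deterministic,
# then materialize the offsets as a hidden default-index column.
ordered = table.order_by(["name", "gender", "year"])
offsets = ibis.row_number().cast(ibis_dtypes.int64).name("bigframes_index_0")
with_offsets = ordered.mutate(offsets)
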
31 changes: 31 additions & 0 deletions tests/system/conftest.py
@@ -17,6 +17,7 @@
import logging
import math
import pathlib
import textwrap
import typing
from typing import Dict, Optional

@@ -795,6 +796,36 @@ def penguins_randomforest_classifier_model_name(
    return model_name


@pytest.fixture(scope="session")
def usa_names_grouped_table(
session: bigframes.Session, dataset_id_permanent
) -> bigquery.Table:
"""Provides a table with primary key(s) set."""
table_id = f"{dataset_id_permanent}.usa_names_grouped"
try:
return session.bqclient.get_table(table_id)
except google.cloud.exceptions.NotFound:
query = textwrap.dedent(
f"""
CREATE TABLE `{dataset_id_permanent}.usa_names_grouped`
(
total_people INT64,
name STRING,
gender STRING,
year INT64,
PRIMARY KEY(name, gender, year) NOT ENFORCED
)
AS
SELECT SUM(`number`) AS total_people, name, gender, year
FROM `bigquery-public-data.usa_names.usa_1910_2013`
GROUP BY name, gender, year
"""
)
job = session.bqclient.query(query)
job.result()
return session.bqclient.get_table(table_id)

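The `_properties` lookups in session.py and in the test below depend on the tables.get REST response shape. A sketch of the relevant fragment for this fixture's key follows; the field names match the BigQuery API, while the payload literal is illustrative.

# Field names follow the BigQuery tables.get response; the values are just
# this fixture's key columns.
properties = {
    "tableConstraints": {
        "primaryKey": {"columns": ["name", "gender", "year"]},
    },
}
primary_keys = (
    properties.get("tableConstraints", {}).get("primaryKey", {}).get("columns")
)
assert primary_keys == ["name", "gender", "year"]
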

@pytest.fixture()
def deferred_repr():
    bigframes.options.display.repr_mode = "deferred"
25 changes: 25 additions & 0 deletions tests/system/small/test_session.py
@@ -20,6 +20,7 @@
from typing import List

import google.api_core.exceptions
import google.cloud.bigquery as bigquery
import numpy as np
import pandas as pd
import pytest
@@ -231,6 +232,30 @@ def test_read_gbq_w_anonymous_query_results_table(session: bigframes.Session):
    pd.testing.assert_frame_equal(result, expected, check_dtype=False)


def test_read_gbq_w_primary_keys_table(
    session: bigframes.Session, usa_names_grouped_table: bigquery.Table
):
    table = usa_names_grouped_table
    # TODO(b/305264153): Use public properties to fetch primary keys once
    # added to google-cloud-bigquery.
    primary_keys = (
        table._properties.get("tableConstraints", {})
        .get("primaryKey", {})
        .get("columns")
    )
    assert len(primary_keys) != 0

    df = session.read_gbq(f"{table.project}.{table.dataset_id}.{table.table_id}")
    result = df.head(100).to_pandas()

    # Verify that the DataFrame is already sorted by primary keys.
    sorted_result = result.sort_values(primary_keys)
    pd.testing.assert_frame_equal(result, sorted_result)

    # Verify that we're working from a snapshot rather than a copy of the table.
    assert "FOR SYSTEM_TIME AS OF TIMESTAMP" in df.sql


@pytest.mark.parametrize(
("query_or_table", "max_results"),
[