Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
cdec057
feat: add `allow_large_results` option to `read_gbq_query`
tswast Jul 24, 2025
3b78da7
add system test
tswast Jul 24, 2025
ad11cc8
Merge remote-tracking branch 'origin/main' into allow_large_results
tswast Jul 24, 2025
68f1a10
add to pandas module
tswast Jul 24, 2025
b1e31e3
Merge branch 'main' into allow_large_results
tswast Aug 20, 2025
3b770ae
default to global option
tswast Aug 20, 2025
d01cdcb
Merge remote-tracking branch 'origin/allow_large_results' into allow_…
tswast Aug 20, 2025
10a8302
fix unit test
tswast Aug 20, 2025
938fb89
tweak imports so I can manually run doctest
tswast Aug 21, 2025
78d29f0
support index_col and columns
tswast Aug 21, 2025
435c602
add system tests and fix pandas warning
tswast Aug 21, 2025
7199fee
Merge remote-tracking branch 'origin/main' into allow_large_results
tswast Aug 21, 2025
778746f
use global option in remaining functions
tswast Aug 21, 2025
766640a
Merge remote-tracking branch 'origin/main' into allow_large_results
tswast Aug 21, 2025
95b2fdf
supply allow_large_results=False when max_results is set
tswast Aug 21, 2025
05c145c
fix last? failing test
tswast Aug 21, 2025
e87b548
Merge branch 'main' into allow_large_results
tswast Aug 21, 2025
58f45f8
fix vector search tests
tswast Aug 21, 2025
d611337
Merge remote-tracking branch 'origin/allow_large_results' into allow_…
tswast Aug 21, 2025
6d0fe48
fix vector search tests again
tswast Aug 21, 2025
8c27c2c
fix arima tests
tswast Aug 21, 2025
5ab21a9
fix more tests
tswast Aug 21, 2025
2ca9e9e
try again
tswast Aug 21, 2025
0c8b88c
and again
tswast Aug 21, 2025
2a29309
Merge branch 'main' into allow_large_results
tswast Sep 2, 2025
9362838
exclude ML results from jobless query path
tswast Sep 2, 2025
d3997ba
fix unit tests
tswast Sep 2, 2025
7cb3d55
add parameter for vector_search too
tswast Sep 2, 2025
0a70173
fix doctest
tswast Sep 2, 2025
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions bigframes/pandas/io/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ def read_gbq( # type: ignore[overload-overlap]
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> bigframes.dataframe.DataFrame:
...

Expand All @@ -200,6 +201,7 @@ def read_gbq(
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

Expand All @@ -215,6 +217,7 @@ def read_gbq(
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
dry_run: bool = False,
allow_large_results: bool = True,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should allow_large_results also default to None? This would allow it to inherit its value from ComputeOptions.allow_large_results.

Copy link
Collaborator Author

@tswast tswast Aug 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we do so, it'd technically be a breaking change. Users with query results > 10 GB would have to set this option to True.

Might be worth it for the consistency with other places, though?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.

) -> bigframes.dataframe.DataFrame | pandas.Series:
_set_default_session_location_if_possible(query_or_table)
return global_session.with_default_session(
Expand All @@ -228,6 +231,7 @@ def read_gbq(
use_cache=use_cache,
col_order=col_order,
dry_run=dry_run,
allow_large_results=allow_large_results,
)


Expand Down Expand Up @@ -391,6 +395,7 @@ def read_gbq_query( # type: ignore[overload-overlap]
col_order: Iterable[str] = ...,
filters: vendored_pandas_gbq.FiltersType = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> bigframes.dataframe.DataFrame:
...

Expand All @@ -407,6 +412,7 @@ def read_gbq_query(
col_order: Iterable[str] = ...,
filters: vendored_pandas_gbq.FiltersType = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

Expand All @@ -422,6 +428,7 @@ def read_gbq_query(
col_order: Iterable[str] = (),
filters: vendored_pandas_gbq.FiltersType = (),
dry_run: bool = False,
allow_large_results: bool = True,
) -> bigframes.dataframe.DataFrame | pandas.Series:
_set_default_session_location_if_possible(query)
return global_session.with_default_session(
Expand All @@ -435,6 +442,7 @@ def read_gbq_query(
col_order=col_order,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)


Expand Down
89 changes: 83 additions & 6 deletions bigframes/session/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ def read_gbq( # type: ignore[overload-overlap]
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> dataframe.DataFrame:
...

Expand All @@ -410,6 +411,7 @@ def read_gbq(
use_cache: Optional[bool] = ...,
col_order: Iterable[str] = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

Expand All @@ -424,8 +426,8 @@ def read_gbq(
filters: third_party_pandas_gbq.FiltersType = (),
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
dry_run: bool = False
# Add a verify index argument that fails if the index is not unique.
dry_run: bool = False,
allow_large_results: bool = True,
) -> dataframe.DataFrame | pandas.Series:
# TODO(b/281571214): Generate prompt to show the progress of read_gbq.
if columns and col_order:
Expand All @@ -445,6 +447,7 @@ def read_gbq(
use_cache=use_cache,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)
else:
if configuration is not None:
Expand Down Expand Up @@ -551,6 +554,7 @@ def read_gbq_query( # type: ignore[overload-overlap]
col_order: Iterable[str] = ...,
filters: third_party_pandas_gbq.FiltersType = ...,
dry_run: Literal[False] = ...,
allow_large_results: bool = ...,
) -> dataframe.DataFrame:
...

Expand All @@ -567,6 +571,7 @@ def read_gbq_query(
col_order: Iterable[str] = ...,
filters: third_party_pandas_gbq.FiltersType = ...,
dry_run: Literal[True] = ...,
allow_large_results: bool = ...,
) -> pandas.Series:
...

Expand All @@ -582,6 +587,7 @@ def read_gbq_query(
col_order: Iterable[str] = (),
filters: third_party_pandas_gbq.FiltersType = (),
dry_run: bool = False,
allow_large_results: bool = True,
) -> dataframe.DataFrame | pandas.Series:
"""Turn a SQL query into a DataFrame.

Expand Down Expand Up @@ -631,9 +637,48 @@ def read_gbq_query(

See also: :meth:`Session.read_gbq`.

Args:
query (str):
A SQL query to execute.
index_col (Iterable[str] or str, optional):
The column(s) to use as the index for the DataFrame. This can be
a single column name or a list of column names. If not provided,
a default index will be used.
columns (Iterable[str], optional):
The columns to read from the query result. If not
specified, all columns will be read.
configuration (dict, optional):
A dictionary of query job configuration options. See the
BigQuery REST API documentation for a list of available options:
https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query
max_results (int, optional):
The maximum number of rows to retrieve from the query
result. If not specified, all rows will be loaded.
use_cache (bool, optional):
Whether to use cached results for the query. Defaults to ``True``.
Setting this to ``False`` will force a re-execution of the query.
col_order (Iterable[str], optional):
The desired order of columns in the resulting DataFrame. This
parameter is deprecated and will be removed in a future version.
Use ``columns`` instead.
filters (list[tuple], optional):
A list of filters to apply to the data. Filters are specified
as a list of tuples, where each tuple contains a column name,
an operator (e.g., '==', '!='), and a value.
dry_run (bool, optional):
If ``True``, the function will not actually execute the query but
will instead return statistics about the query. Defaults to
``False``.
allow_large_results (bool, optional):
Whether to allow large query results. If ``True``, the query
results can be larger than the maximum response size.
Defaults to ``True``.

Returns:
bigframes.pandas.DataFrame:
A DataFrame representing results of the query or table.
bigframes.pandas.DataFrame or pandas.Series:
A DataFrame representing the result of the query. If ``dry_run``
is ``True``, a ``pandas.Series`` containing query statistics is
returned.

Raises:
ValueError:
Expand All @@ -657,6 +702,7 @@ def read_gbq_query(
use_cache=use_cache,
filters=filters,
dry_run=dry_run,
allow_large_results=allow_large_results,
)

@overload
Expand Down Expand Up @@ -714,9 +760,40 @@ def read_gbq_table(

See also: :meth:`Session.read_gbq`.

Args:
table_id (str):
The identifier of the BigQuery table to read.
index_col (Iterable[str] or str, optional):
The column(s) to use as the index for the DataFrame. This can be
a single column name or a list of column names. If not provided,
a default index will be used.
columns (Iterable[str], optional):
The columns to read from the table. If not specified, all
columns will be read.
max_results (int, optional):
The maximum number of rows to retrieve from the table. If not
specified, all rows will be loaded.
filters (list[tuple], optional):
A list of filters to apply to the data. Filters are specified
as a list of tuples, where each tuple contains a column name,
an operator (e.g., '==', '!='), and a value.
use_cache (bool, optional):
Whether to use cached results for the query. Defaults to ``True``.
Setting this to ``False`` will force a re-execution of the query.
col_order (Iterable[str], optional):
The desired order of columns in the resulting DataFrame. This
parameter is deprecated and will be removed in a future version.
Use ``columns`` instead.
dry_run (bool, optional):
If ``True``, the function will not actually execute the query but
will instead return statistics about the table. Defaults to
``False``.

Returns:
bigframes.pandas.DataFrame:
A DataFrame representing results of the query or table.
bigframes.pandas.DataFrame or pandas.Series:
A DataFrame representing the contents of the table. If
``dry_run`` is ``True``, a ``pandas.Series`` containing table
statistics is returned.

Raises:
ValueError:
Expand Down
32 changes: 32 additions & 0 deletions tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import pytest

import bigframes
import bigframes.core.nodes as nodes
import bigframes.dataframe
import bigframes.dtypes
import bigframes.ml.linear_model
Expand Down Expand Up @@ -640,6 +641,37 @@ def test_read_gbq_with_configuration(
assert df.shape == (9, 3)


def test_read_gbq_query_w_allow_large_results(session: bigframes.Session):
    """System test: ``allow_large_results`` controls how query results are materialized.

    With ``allow_large_results=False`` the small result should be downloaded and
    wrapped in a local (in-memory) node; with ``allow_large_results=True`` the
    result should be backed by a BigQuery destination table instead.
    """
    # The jobless (stateless) query path this test exercises is only exposed by
    # newer google-cloud-bigquery clients — skip rather than fail on older ones.
    if not hasattr(session.bqclient, "default_job_creation_mode"):
        pytest.skip("Jobless query only available on newer google-cloud-bigquery.")

    query = "SELECT 1"

    # Make sure we don't get a cached table.
    configuration = {"query": {"useQueryCache": False}}

    # Very small results should wrap a local node.
    df_false = session.read_gbq(
        query,
        configuration=configuration,
        allow_large_results=False,
    )
    assert df_false.shape == (1, 1)
    # Inspect the expression tree leaves to confirm the data source kind.
    # NOTE(review): relies on private internals (_get_block) — brittle if the
    # block/expr API changes.
    roots_false = df_false._get_block().expr.node.roots
    assert any(isinstance(node, nodes.ReadLocalNode) for node in roots_false)
    assert not any(isinstance(node, nodes.ReadTableNode) for node in roots_false)

    # Large results allowed should wrap a table.
    df_true = session.read_gbq(
        query,
        configuration=configuration,
        allow_large_results=True,
    )
    assert df_true.shape == (1, 1)
    roots_true = df_true._get_block().expr.node.roots
    assert any(isinstance(node, nodes.ReadTableNode) for node in roots_true)


def test_read_gbq_with_custom_global_labels(
session: bigframes.Session, scalars_table_id: str
):
Expand Down
6 changes: 6 additions & 0 deletions third_party/bigframes_vendored/pandas/io/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def read_gbq(
filters: FiltersType = (),
use_cache: Optional[bool] = None,
col_order: Iterable[str] = (),
allow_large_results: bool = True,
):
"""Loads a DataFrame from BigQuery.

Expand Down Expand Up @@ -156,6 +157,11 @@ def read_gbq(
`configuration` to avoid conflicts.
col_order (Iterable[str]):
Alias for columns, retained for backwards compatibility.
allow_large_results (bool, optional):
Whether to allow large query results. If ``True``, the query
results can be larger than the maximum response size. This
option is only applicable when ``query_or_table`` is a query.
Defaults to ``True``.

Raises:
bigframes.exceptions.DefaultIndexWarning:
Expand Down