Skip to content

Commit 8ca17e6

Browse files
Merge remote-tracking branch 'github/main' into only_once
2 parents 7541289 + 37f8c32 commit 8ca17e6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1339
-220
lines changed

CHANGELOG.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,35 @@
44

55
[1]: https://pypi.org/project/bigframes/#history
66

7+
## [1.26.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.25.0...v1.26.0) (2024-11-12)
8+
9+
10+
### Features
11+
12+
* Add basic geopandas functionality ([#962](https://github.com/googleapis/python-bigquery-dataframes/issues/962)) ([3759c63](https://github.com/googleapis/python-bigquery-dataframes/commit/3759c6397eaa3c46c4142aa51ca22be3dc8e4971))
13+
* Support `json_extract_string_array` in the `bigquery` module ([#1131](https://github.com/googleapis/python-bigquery-dataframes/issues/1131)) ([4ef8bac](https://github.com/googleapis/python-bigquery-dataframes/commit/4ef8bacdcc5447ba53c0f354526346f4dec7c5a1))
14+
15+
16+
### Bug Fixes
17+
18+
* Fix Series.to_frame generating string label instead of int where name is None ([#1118](https://github.com/googleapis/python-bigquery-dataframes/issues/1118)) ([14e32b5](https://github.com/googleapis/python-bigquery-dataframes/commit/14e32b51c11c1718128f49ef94e754afc0ac0618))
19+
* Update the API documentation with newly added rep ([#1120](https://github.com/googleapis/python-bigquery-dataframes/issues/1120)) ([72c228b](https://github.com/googleapis/python-bigquery-dataframes/commit/72c228b15627e6047d60ae42740563a6dfea73da))
20+
21+
22+
### Performance Improvements
23+
24+
* Reduce CURRENT_TIMESTAMP queries ([#1114](https://github.com/googleapis/python-bigquery-dataframes/issues/1114)) ([32274b1](https://github.com/googleapis/python-bigquery-dataframes/commit/32274b130849b37d7e587643cf7b6d109455ff38))
25+
* Reduce dry runs from read_gbq with table ([#1129](https://github.com/googleapis/python-bigquery-dataframes/issues/1129)) ([f7e4354](https://github.com/googleapis/python-bigquery-dataframes/commit/f7e435488d630cf4cf493c89ecdde94a95a7a0d7))
26+
27+
28+
### Documentation
29+
30+
* Add file for Classification with a Boosted Tree Model and snippet for preparing sample data ([#1135](https://github.com/googleapis/python-bigquery-dataframes/issues/1135)) ([7ac6639](https://github.com/googleapis/python-bigquery-dataframes/commit/7ac6639fb0e8baf5fb3adf5785dffd8cf9b06702))
31+
* Add snippet for Linear Regression tutorial Predict Outcomes section ([#1101](https://github.com/googleapis/python-bigquery-dataframes/issues/1101)) ([108f4a9](https://github.com/googleapis/python-bigquery-dataframes/commit/108f4a98463596d8df6d381b3580eb72eab41b6e))
32+
* Update `DataFrame` docstrings to include the errors section ([#1127](https://github.com/googleapis/python-bigquery-dataframes/issues/1127)) ([a38d4c4](https://github.com/googleapis/python-bigquery-dataframes/commit/a38d4c422b6b312f6a54d7b1dd105a474ec2e91a))
33+
* Update GroupBy docstrings ([#1103](https://github.com/googleapis/python-bigquery-dataframes/issues/1103)) ([9867a78](https://github.com/googleapis/python-bigquery-dataframes/commit/9867a788e7c46bf0850cacbe7cd41a11fea32d6b))
34+
* Update Session docstrings to include exceptions ([#1130](https://github.com/googleapis/python-bigquery-dataframes/issues/1130)) ([a870421](https://github.com/googleapis/python-bigquery-dataframes/commit/a87042158b181dceee31124fe208926a3bb1071f))
35+
736
## [1.25.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.24.0...v1.25.0) (2024-10-29)
837

938

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from bigframes.bigquery._operations.json import (
2626
json_extract,
2727
json_extract_array,
28+
json_extract_string_array,
2829
json_set,
2930
)
3031
from bigframes.bigquery._operations.search import create_vector_index, vector_search
@@ -37,6 +38,7 @@
3738
"json_set",
3839
"json_extract",
3940
"json_extract_array",
41+
"json_extract_string_array",
4042
"approx_top_count",
4143
"struct",
4244
"create_vector_index",

bigframes/bigquery/_operations/json.py

Lines changed: 104 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,17 @@
2121

2222
from __future__ import annotations
2323

24-
from typing import Any, Sequence, Tuple
24+
from typing import Any, cast, Optional, Sequence, Tuple, Union
2525

26+
import bigframes.dtypes
2627
import bigframes.operations as ops
2728
import bigframes.series as series
2829

30+
from . import array
31+
2932

3033
def json_set(
31-
series: series.Series,
34+
input: series.Series,
3235
json_path_value_pairs: Sequence[Tuple[str, Any]],
3336
) -> series.Series:
3437
"""Produces a new JSON value within a Series by inserting or replacing values at
@@ -47,7 +50,7 @@ def json_set(
4750
Name: data, dtype: string
4851
4952
Args:
50-
series (bigframes.series.Series):
53+
input (bigframes.series.Series):
5154
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
5255
json_path_value_pairs (Sequence[Tuple[str, Any]]):
5356
Pairs of JSON path and the new value to insert/replace.
@@ -59,6 +62,7 @@ def json_set(
5962
# SQLGlot parser does not support the "create_if_missing => true" syntax, so
6063
# create_if_missing is not currently implemented.
6164

65+
result = input
6266
for json_path_value_pair in json_path_value_pairs:
6367
if len(json_path_value_pair) != 2:
6468
raise ValueError(
@@ -67,14 +71,14 @@ def json_set(
6771
)
6872

6973
json_path, json_value = json_path_value_pair
70-
series = series._apply_binary_op(
74+
result = result._apply_binary_op(
7175
json_value, ops.JSONSet(json_path=json_path), alignment="left"
7276
)
73-
return series
77+
return result
7478

7579

7680
def json_extract(
77-
series: series.Series,
81+
input: series.Series,
7882
json_path: str,
7983
) -> series.Series:
8084
"""Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
@@ -93,24 +97,24 @@ def json_extract(
9397
dtype: string
9498
9599
Args:
96-
series (bigframes.series.Series):
100+
input (bigframes.series.Series):
97101
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
98102
json_path (str):
99103
The JSON path identifying the data that you want to obtain from the input.
100104
101105
Returns:
102106
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
103107
"""
104-
return series._apply_unary_op(ops.JSONExtract(json_path=json_path))
108+
return input._apply_unary_op(ops.JSONExtract(json_path=json_path))
105109

106110

107111
def json_extract_array(
108-
series: series.Series,
112+
input: series.Series,
109113
json_path: str = "$",
110114
) -> series.Series:
111-
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON`
112-
values. This function uses single quotes and brackets to escape invalid JSONPath
113-
characters in JSON keys.
115+
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted
116+
`STRING` or `JSON` values. This function uses single quotes and brackets to
117+
escape invalid JSONPath characters in JSON keys.
114118
115119
**Examples:**
116120
@@ -124,13 +128,98 @@ def json_extract_array(
124128
1 ['4' '5']
125129
dtype: list<item: string>[pyarrow]
126130
131+
>>> s = bpd.Series([
132+
... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
133+
... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}'
134+
... ])
135+
>>> bbq.json_extract_array(s, "$.fruits")
136+
0 ['{"name":"apple"}' '{"name":"cherry"}']
137+
1 ['{"name":"guava"}' '{"name":"grapes"}']
138+
dtype: list<item: string>[pyarrow]
139+
140+
>>> s = bpd.Series([
141+
... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
142+
... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
143+
... ])
144+
>>> bbq.json_extract_array(s, "$.fruits.names")
145+
0 ['"apple"' '"cherry"']
146+
1 ['"guava"' '"grapes"']
147+
dtype: list<item: string>[pyarrow]
148+
127149
Args:
128-
series (bigframes.series.Series):
150+
input (bigframes.series.Series):
129151
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
130152
json_path (str):
131153
The JSON path identifying the data that you want to obtain from the input.
132154
133155
Returns:
134-
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
156+
bigframes.series.Series: A new Series with the parsed arrays from the input.
135157
"""
136-
return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
158+
return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
159+
160+
161+
def json_extract_string_array(
162+
input: series.Series,
163+
json_path: str = "$",
164+
value_dtype: Optional[
165+
Union[bigframes.dtypes.Dtype, bigframes.dtypes.DtypeString]
166+
] = None,
167+
) -> series.Series:
168+
"""Extracts a JSON array and converts it to a SQL array of `STRING` values.
169+
A `value_dtype` can be provided to further coerce the data type of the
170+
values in the array. This function uses single quotes and brackets to escape
171+
invalid JSONPath characters in JSON keys.
172+
173+
**Examples:**
174+
175+
>>> import bigframes.pandas as bpd
176+
>>> import bigframes.bigquery as bbq
177+
>>> bpd.options.display.progress_bar = None
178+
179+
>>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
180+
>>> bbq.json_extract_string_array(s)
181+
0 ['1' '2' '3']
182+
1 ['4' '5']
183+
dtype: list<item: string>[pyarrow]
184+
185+
>>> bbq.json_extract_string_array(s, value_dtype='Int64')
186+
0 [1 2 3]
187+
1 [4 5]
188+
dtype: list<item: int64>[pyarrow]
189+
190+
>>> s = bpd.Series([
191+
... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
192+
... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
193+
... ])
194+
>>> bbq.json_extract_string_array(s, "$.fruits.names")
195+
0 ['apple' 'cherry']
196+
1 ['guava' 'grapes']
197+
dtype: list<item: string>[pyarrow]
198+
199+
Args:
200+
input (bigframes.series.Series):
201+
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
202+
json_path (str):
203+
The JSON path identifying the data that you want to obtain from the input.
204+
value_dtype (dtype, Optional):
205+
The data type supported by BigFrames DataFrame.
206+
207+
Returns:
208+
bigframes.series.Series: A new Series with the parsed arrays from the input.
209+
"""
210+
array_series = input._apply_unary_op(
211+
ops.JSONExtractStringArray(json_path=json_path)
212+
)
213+
if value_dtype not in [None, bigframes.dtypes.STRING_DTYPE]:
214+
array_items_series = array_series.explode()
215+
if value_dtype == bigframes.dtypes.BOOL_DTYPE:
216+
array_items_series = array_items_series.str.lower() == "true"
217+
else:
218+
array_items_series = array_items_series.astype(value_dtype)
219+
array_series = cast(
220+
series.Series,
221+
array.array_agg(
222+
array_items_series.groupby(level=input.index.names, dropna=False)
223+
),
224+
)
225+
return array_series

bigframes/core/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,13 @@ def promote_offsets(self) -> Tuple[ArrayValue, str]:
268268
def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue:
269269
"""Append together multiple ArrayValue objects."""
270270
return ArrayValue(
271-
nodes.ConcatNode(children=tuple([self.node, *[val.node for val in other]]))
271+
nodes.ConcatNode(
272+
children=tuple([self.node, *[val.node for val in other]]),
273+
output_ids=tuple(
274+
ids.ColumnId(bigframes.core.guid.generate_guid())
275+
for id in self.column_ids
276+
),
277+
)
272278
)
273279

274280
def compute_values(self, assignments: Sequence[ex.Expression]):

bigframes/core/blocks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3137,7 +3137,7 @@ def _pd_index_to_array_value(
31373137
rows = []
31383138
labels_as_tuples = utils.index_as_tuples(index)
31393139
for row_offset in range(len(index)):
3140-
id_gen = bigframes.core.identifiers.standard_identifiers()
3140+
id_gen = bigframes.core.identifiers.standard_id_strings()
31413141
row_label = labels_as_tuples[row_offset]
31423142
row_label = (row_label,) if not isinstance(row_label, tuple) else row_label
31433143
row = {}

bigframes/core/compile/api.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,15 @@
1818
import google.cloud.bigquery as bigquery
1919

2020
import bigframes.core.compile.compiler as compiler
21-
import bigframes.core.rewrite as rewrites
2221

2322
if TYPE_CHECKING:
2423
import bigframes.core.nodes
2524
import bigframes.core.ordering
2625
import bigframes.core.schema
2726

28-
_STRICT_COMPILER = compiler.Compiler(strict=True)
27+
_STRICT_COMPILER = compiler.Compiler(
28+
strict=True, enable_pruning=True, enable_densify_ids=True
29+
)
2930

3031

3132
class SQLCompiler:
@@ -34,7 +35,7 @@ def __init__(self, strict: bool = True):
3435

3536
def compile_peek(self, node: bigframes.core.nodes.BigFrameNode, n_rows: int) -> str:
3637
"""Compile node into sql that selects N arbitrary rows, may not execute deterministically."""
37-
return self._compiler.compile_unordered_ir(node).peek_sql(n_rows)
38+
return self._compiler.compile_peek_sql(node, n_rows)
3839

3940
def compile_unordered(
4041
self,
@@ -44,9 +45,8 @@ def compile_unordered(
4445
) -> str:
4546
"""Compile node into sql where rows are unsorted, and no ordering information is preserved."""
4647
# TODO: Enable limit pullup, but only if not being used to write to clustered table.
47-
return self._compiler.compile_unordered_ir(node).to_sql(
48-
col_id_overrides=col_id_overrides
49-
)
48+
output_ids = [col_id_overrides.get(id, id) for id in node.schema.names]
49+
return self._compiler.compile_sql(node, ordered=False, output_ids=output_ids)
5050

5151
def compile_ordered(
5252
self,
@@ -56,10 +56,8 @@ def compile_ordered(
5656
) -> str:
5757
"""Compile node into sql where rows are sorted with ORDER BY."""
5858
# If we are ordering the query anyways, compiling the slice as a limit is probably a good idea.
59-
new_node, limit = rewrites.pullup_limit_from_slice(node)
60-
return self._compiler.compile_ordered_ir(new_node).to_sql(
61-
col_id_overrides=col_id_overrides, ordered=True, limit=limit
62-
)
59+
output_ids = [col_id_overrides.get(id, id) for id in node.schema.names]
60+
return self._compiler.compile_sql(node, ordered=True, output_ids=output_ids)
6361

6462
def compile_raw(
6563
self,
@@ -68,13 +66,12 @@ def compile_raw(
6866
str, Sequence[bigquery.SchemaField], bigframes.core.ordering.RowOrdering
6967
]:
7068
"""Compile node into sql that exposes all columns, including hidden ordering-only columns."""
71-
ir = self._compiler.compile_ordered_ir(node)
72-
sql, schema = ir.raw_sql_and_schema()
73-
return sql, schema, ir._ordering
69+
return self._compiler.compile_raw(node)
7470

7571

7672
def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode):
7773
"""Use only for unit testing paths - not fully featured. Will throw exception if fails."""
74+
node = _STRICT_COMPILER._preprocess(node)
7875
ibis = _STRICT_COMPILER.compile_ordered_ir(node)._to_ibis_expr(
7976
ordering_mode="unordered"
8077
)
@@ -85,9 +82,10 @@ def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
8582
"""Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema."""
8683
import bigframes.core.schema
8784

85+
node = _STRICT_COMPILER._preprocess(node)
8886
compiled = _STRICT_COMPILER.compile_unordered_ir(node)
8987
items = tuple(
90-
bigframes.core.schema.SchemaItem(id, compiled.get_column_type(id))
91-
for id in compiled.column_ids
88+
bigframes.core.schema.SchemaItem(name, compiled.get_column_type(ibis_id))
89+
for name, ibis_id in zip(node.schema.names, compiled.column_ids)
9290
)
9391
return bigframes.core.schema.ArraySchema(items)

0 commit comments

Comments
 (0)