 import typing
 from typing import (
     Dict,
+    Generator,
     Hashable,
     IO,
     Iterable,

 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq
 import google.api_core.exceptions
+from google.cloud import bigquery_storage_v1
 import google.cloud.bigquery as bigquery
-import google.cloud.bigquery.table
+from google.cloud.bigquery_storage_v1 import types as bq_storage_types
 import pandas
 import pyarrow as pa

-from bigframes.core import local_data, utils
+from bigframes.core import guid, local_data, utils
 import bigframes.core as core
 import bigframes.core.blocks as blocks
 import bigframes.core.schema as schemata
@@ -142,13 +144,15 @@ def __init__(
         self,
         session: bigframes.session.Session,
         bqclient: bigquery.Client,
+        write_client: bigquery_storage_v1.BigQueryWriteClient,
         storage_manager: bigframes.session.temporary_storage.TemporaryStorageManager,
         default_index_type: bigframes.enums.DefaultIndexKind,
         scan_index_uniqueness: bool,
         force_total_order: bool,
         metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None,
     ):
         self._bqclient = bqclient
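+        # Storage Write API client used by write_data to append Arrow record batches.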
+        self._write_client = write_client
         self._storage_manager = storage_manager
         self._default_index_type = default_index_type
         self._scan_index_uniqueness = scan_index_uniqueness
@@ -165,7 +169,7 @@ def __init__(
     def read_pandas(
         self,
         pandas_dataframe: pandas.DataFrame,
-        method: Literal["load", "stream"],
+        method: Literal["load", "stream", "write"],
         api_name: str,
     ) -> dataframe.DataFrame:
         # TODO: Push this into from_pandas, along with index flag
@@ -183,6 +187,8 @@ def read_pandas(
             array_value = self.load_data(managed_data, api_name=api_name)
         elif method == "stream":
             array_value = self.stream_data(managed_data)
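+        # "write" appends rows through the BigQuery Storage Write API (see write_data below).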
+        elif method == "write":
+            array_value = self.write_data(managed_data)
         else:
             raise ValueError(f"Unsupported read method {method}")

@@ -198,7 +204,7 @@ def load_data(
         self, data: local_data.ManagedArrowTable, api_name: Optional[str] = None
     ) -> core.ArrayValue:
         """Load managed data into bigquery"""
-        ordering_col = "bf_load_job_offsets"
+        ordering_col = guid.generate_guid("load_offsets_")
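+        # A generated name keeps the hidden offsets column from colliding with user columns.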

         # JSON support incomplete
         for item in data.schema.items:
@@ -244,7 +250,7 @@ def load_data(

     def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue:
         """Load managed data into bigquery"""
-        ordering_col = "bf_stream_job_offsets"
+        ordering_col = guid.generate_guid("stream_offsets_")
         schema_w_offsets = data.schema.append(
             schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE)
         )
@@ -277,6 +283,53 @@ def stream_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue:
             n_rows=data.data.num_rows,
         ).drop_columns([ordering_col])

+    def write_data(self, data: local_data.ManagedArrowTable) -> core.ArrayValue:
+        """Load managed data into bigquery with the BigQuery Storage Write API"""
+        ordering_col = guid.generate_guid("stream_offsets_")
+        schema_w_offsets = data.schema.append(
+            schemata.SchemaItem(ordering_col, bigframes.dtypes.INT_DTYPE)
+        )
+        bq_schema = schema_w_offsets.to_bigquery(_STREAM_JOB_TYPE_OVERRIDES)
+        bq_table_ref = self._storage_manager.create_temp_table(
+            bq_schema, [ordering_col]
+        )
+
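+        # A COMMITTED write stream makes appended rows readable as soon as the
+        # server acknowledges them, so no separate finalize/commit step is needed.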
+        requested_stream = bq_storage_types.stream.WriteStream()
+        requested_stream.type_ = bq_storage_types.stream.WriteStream.Type.COMMITTED  # type: ignore
+
+        stream_request = bq_storage_types.CreateWriteStreamRequest(
+            parent=bq_table_ref.to_bqstorage(), write_stream=requested_stream
+        )
+        stream = self._write_client.create_write_stream(request=stream_request)
+
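+        # Each Arrow record batch is serialized into an AppendRowsRequest; the Arrow
+        # writer schema is attached to every request so the service can decode the batches.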
+        def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]:
+            schema, batches = data.to_arrow(
+                offsets_col=ordering_col, duration_type="int"
+            )
+            for batch in batches:
+                request = bq_storage_types.AppendRowsRequest(write_stream=stream.name)
+                request.arrow_rows.writer_schema.serialized_schema = (
+                    schema.serialize().to_pybytes()
+                )
+                request.arrow_rows.rows.serialized_record_batch = (
+                    batch.serialize().to_pybytes()
+                )
+                yield request
+
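+        # append_rows streams the requests and yields one response per request;
+        # surface any per-row failures instead of silently dropping data.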
+        for response in self._write_client.append_rows(requests=request_gen()):
+            if response.row_errors:
+                raise ValueError(
+                    f"Problem loading at least one row from DataFrame: {response.row_errors}. {constants.FEEDBACK_LINK}"
+                )
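+        # Re-fetch the destination table for its final metadata, order by the hidden
+        # offsets column, then drop that column from the visible schema.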
+        destination_table = self._bqclient.get_table(bq_table_ref)
+        return core.ArrayValue.from_table(
+            table=destination_table,
+            schema=schema_w_offsets,
+            session=self._session,
+            offsets_col=ordering_col,
+            n_rows=data.data.num_rows,
+        ).drop_columns([ordering_col])
+
     def _start_generic_job(self, job: formatting_helpers.GenericJob):
         if bigframes.options.display.progress_bar is not None:
             formatting_helpers.wait_for_job(