Skip to content
14 changes: 8 additions & 6 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame:
def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str:
"""Create query text representing this dataframe for I/O."""
array_value = self._block.expr

new_col_labels, new_idx_labels = utils.get_standardized_ids(
self._block.column_labels, self.index.names
)

columns = list(self._block.value_columns)
column_labels = list(self._block.column_labels)
column_labels = new_col_labels
# This code drops unnamed indexes to keep consistent with the behavior of
# most pandas write APIs. The exception is `pandas.to_csv`, which keeps
# unnamed indexes as `Unnamed: 0`.
# TODO(chelsealin): check if works for multiple indexes.
if index and self.index.name is not None:
columns.extend(self._block.index_columns)
column_labels.extend(self.index.names)
column_labels.extend(new_idx_labels)
else:
array_value = array_value.drop_columns(self._block.index_columns)

# Make columns in SQL reflect _labels_ not _ids_. Note: This may use
# the arbitrary unicode column labels feature in BigQuery, which is
# currently (June 2023) in preview.
# TODO(swast): Handle duplicate and NULL labels.
id_overrides = {
col_id: col_label
for col_id, col_label in zip(columns, column_labels)
if col_label and isinstance(col_label, str)
col_id: col_label for col_id, col_label in zip(columns, column_labels)
}

if ordering_id is not None:
Expand Down
44 changes: 44 additions & 0 deletions tests/system/small/test_dataframe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,50 @@ def test_to_gbq_if_exists(
)


def test_to_gbq_w_duplicate_column_names(
    scalars_df_index, scalars_pandas_df_index, dataset_id
):
    """Verify `to_gbq` deduplicates column labels that collide after a rename."""
    destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names"

    # Renaming 'int64_too' to 'int64_col' creates a duplicate label; the
    # write path is expected to rewrite the second occurrence as 'int64_col_1'.
    renamed_df = scalars_df_index.rename(columns={"int64_too": "int64_col"})
    renamed_df.to_gbq(destination_table, if_exists="replace")

    round_trip = bpd.read_gbq(destination_table, index_col="rowindex")
    bf_result = round_trip.to_pandas()

    # The first occurrence keeps its original label unchanged...
    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
    )
    # ...while the duplicate gets a numeric suffix; the series names differ
    # by design, so only the values are compared.
    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_too"],
        bf_result["int64_col_1"],
        check_names=False,
    )


def test_to_gbq_w_None_column_names(
    scalars_df_index, scalars_pandas_df_index, dataset_id
):
    """Verify `to_gbq` assigns a placeholder label to a `None`-named column."""
    destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names"

    # Dropping the label entirely (None) should be tolerated by the write
    # path rather than producing an invalid BigQuery column name.
    unnamed_df = scalars_df_index.rename(columns={"int64_too": None})
    unnamed_df.to_gbq(destination_table, if_exists="replace")

    round_trip = bpd.read_gbq(destination_table, index_col="rowindex")
    bf_result = round_trip.to_pandas()

    # Columns with real labels are round-tripped untouched...
    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_col"], bf_result["int64_col"]
    )
    # ...and the None-labeled column surfaces under the generated
    # placeholder name; only values are compared since names differ.
    pd.testing.assert_series_equal(
        scalars_pandas_df_index["int64_too"],
        bf_result["bigframes_unnamed_column"],
        check_names=False,
    )


def test_to_gbq_w_invalid_destination_table(scalars_df_index):
    """Verify `to_gbq` rejects an unqualified destination table id."""
    # A bare table name lacks the required `dataset.table` (or
    # `project.dataset.table`) qualification, so the call must fail fast.
    bad_destination = "table_id"
    with pytest.raises(ValueError):
        scalars_df_index.to_gbq(bad_destination)
Expand Down