Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4386,10 +4386,17 @@ a ``TableCreationError`` if the destination table already exists.

.. note::

If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
be written to the table using the defined table schema and column types. The
dataframe must match the destination table in column order, structure, and
data types.
If the ``if_exists`` argument is set to ``'append'``, the dataframe will be written
to the destination table using the defined table schema and column types. The
dataframe must match the destination table in column order and the following attributes:

* ``name``
* ``type``
* ``mode``

The destination table's columns must all have ``NULLABLE`` modes.


If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
different schema, a delay of 2 minutes will be forced to ensure that the new schema
has propagated in the Google environment. See
Expand Down Expand Up @@ -4454,12 +4461,12 @@ produce the dictionary representation schema of the specified pandas DataFrame.

In [10]: gbq.generate_bq_schema(df, default_type='STRING')

Out[10]: {'fields': [{'name': 'my_bool1', 'type': 'BOOLEAN'},
{'name': 'my_bool2', 'type': 'BOOLEAN'},
{'name': 'my_dates', 'type': 'TIMESTAMP'},
{'name': 'my_float64', 'type': 'FLOAT'},
{'name': 'my_int64', 'type': 'INTEGER'},
{'name': 'my_string', 'type': 'STRING'}]}
Out[10]: {'fields': [{'name': 'my_bool1', 'type': 'BOOLEAN', 'mode': 'NULLABLE'},
{'name': 'my_bool2', 'type': 'BOOLEAN', 'mode': 'NULLABLE'},
{'name': 'my_dates', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
{'name': 'my_float64', 'type': 'FLOAT', 'mode': 'NULLABLE'},
{'name': 'my_int64', 'type': 'INTEGER', 'mode': 'NULLABLE'},
{'name': 'my_string', 'type': 'STRING', 'mode': 'NULLABLE'}]}

.. note::

Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -377,3 +377,5 @@ Bug Fixes


- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)

- Bug in ``DataFrame.to_gbq()`` where dataframes could not be appended to Google BigQuery tables that had been created in any way other than ``DataFrame.to_gbq()``.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this entry to the 0.19.2 whatsnew file?

19 changes: 15 additions & 4 deletions pandas/io/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,11 +477,17 @@ def verify_schema(self, dataset_id, table_id, schema):
from apiclient.errors import HttpError

try:
return (self.service.tables().get(
remote_schema = self.service.tables().get(
projectId=self.project_id,
datasetId=dataset_id,
tableId=table_id
).execute()['schema']) == schema
).execute()['schema']

modified_schema_field_list = [
{key: field[key] for key in field if key != 'description'}
Copy link
Contributor

@parthea parthea May 8, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a unit test to confirm that 'description' is ignored when comparing the schema of an existing table?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! You could argue that test_df_with_matching_schema_should_verify_schema confirms that 'description' is ignored, but I see your point.

So I added test_verify_schema_should_ignore_column_description. The idea was to first prove that 'description' was the one and only key in the BigQuery table's schema that doesn't exist in the schema generated from the DataFrame. Then if the generated schema is verified with verify_schema, that proves description is in fact ignored. Feedback welcome.

There is one test that fails but that's because of this commit in testing.assert_index_equal.

for field in remote_schema['fields']]

return {'fields': modified_schema_field_list} == schema

except HttpError as ex:
self.process_http_error(ex)
Expand Down Expand Up @@ -717,7 +723,11 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
raise InvalidSchema("Please verify that the column order, "
"structure and data types in the "
"DataFrame match the schema of the "
"destination table.")
"destination table. All columns in the "
"destination table must be NULLABLE. "
"To change REQUIRED columns to NULLABLE, "
"see https://cloud.google.com/bigquery/"
"docs/tables#updateschema")
else:
table.create(table_id, table_schema)

Expand Down Expand Up @@ -756,7 +766,8 @@ def _generate_bq_schema(df, default_type='STRING'):
fields = []
for column_name, dtype in df.dtypes.iteritems():
fields.append({'name': column_name,
'type': type_mapping.get(dtype.kind, default_type)})
'type': type_mapping.get(dtype.kind, default_type),
'mode': 'NULLABLE'})

return {'fields': fields}

Expand Down
83 changes: 70 additions & 13 deletions pandas/io/tests/test_gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@
import pytz
import platform
from time import sleep

import numpy as np

from distutils.version import StrictVersion
from pandas import compat

from pandas import NaT
from pandas.compat import u, range
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -75,10 +72,8 @@ def _test_imports():
try:
from apiclient.discovery import build # noqa
from apiclient.errors import HttpError # noqa

from oauth2client.client import OAuth2WebServerFlow # noqa
from oauth2client.client import AccessTokenRefreshError # noqa

from oauth2client.file import Storage # noqa
from oauth2client.tools import run_flow # noqa
_GOOGLE_API_CLIENT_INSTALLED = True
Expand Down Expand Up @@ -185,7 +180,6 @@ def test_generate_bq_schema_deprecated():


class TestGBQConnectorIntegration(tm.TestCase):

def setUp(self):
test_requirements()

Expand Down Expand Up @@ -279,6 +273,15 @@ def test_should_be_able_to_get_results_from_query(self):
class GBQUnitTests(tm.TestCase):
def setUp(self):
test_requirements()
self.sut_public = gbq.GbqConnector('bigquery-public-data')
self.names_df = DataFrame(
{'state': '', 'gender': '', 'year': [1],
'name': '', 'number': [1]},
columns=['state', 'gender', 'year', 'name', 'number'])
self.shakes_df = DataFrame(
{'word': 'foo', 'word_count': [1],
'corpus': 'bar', 'corpus_date': [1]},
columns=['word', 'word_count', 'corpus', 'corpus_date'])

def test_should_return_bigquery_integers_as_python_floats(self):
result = gbq._parse_entry(1, 'INTEGER')
Expand Down Expand Up @@ -349,9 +352,45 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
'SELECT 1', project_id='x',
private_key=re.sub('[a-z]', '9', PRIVATE_KEY_JSON_CONTENTS))

def test_df_with_matching_schema_should_verify_schema(self):
schema = gbq._generate_bq_schema(self.names_df)
self.assertTrue(self.sut_public.verify_schema(
'usa_names', 'usa_1910_2013', schema))

def test_bq_table_with_required_fields_should_fail_verify_schema(self):
schema = gbq._generate_bq_schema(self.shakes_df)
self.assertFalse(
self.sut_public.verify_schema('samples', 'shakespeare', schema))

def test_append_to_table_w_required_cols_should_raise_InvalidSchema(self):
with tm.assertRaises(gbq.InvalidSchema):
gbq.to_gbq(self.shakes_df, 'samples.shakespeare',
'bigquery-public-data', if_exists='append')

def test_verify_schema_should_ignore_column_description(self):
bigquery_service = self.sut_public.get_service()

remote_schema = bigquery_service.tables().get(
projectId='bigquery-public-data',
datasetId='usa_names',
tableId='usa_1910_2013'
).execute()['schema']

gen_schema = gbq._generate_bq_schema(self.names_df)

# prove that the schemas are identical except for description
f = lambda x, y: set(x.keys()) - set(y.keys())
key_set_list = map(f, remote_schema['fields'], gen_schema['fields'])
list_of_keys_in_remote_schema_only = list(
reduce(lambda x, y: set.union(x, y), key_set_list))
self.assertEqual(list_of_keys_in_remote_schema_only, ['description'])

self.assertTrue(self.sut_public.verify_schema('usa_names',
'usa_1910_2013',
gen_schema))

class TestReadGBQIntegration(tm.TestCase):

class TestReadGBQIntegration(tm.TestCase):
@classmethod
def setUpClass(cls):
# - GLOBAL CLASS FIXTURES -
Expand Down Expand Up @@ -493,7 +532,7 @@ def test_column_order(self):
result_frame = gbq.read_gbq(
query, project_id=PROJECT_ID, col_order=col_order)
correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': [
'b'], 'STRING_3': ['c']})[col_order]
'b'], 'STRING_3': ['c']})[col_order]
tm.assert_frame_equal(result_frame, correct_frame)

def test_column_order_plus_index(self):
Expand Down Expand Up @@ -677,10 +716,11 @@ def test_generate_schema(self):
df = tm.makeMixedDataFrame()
schema = gbq._generate_bq_schema(df)

test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
{'name': 'B', 'type': 'FLOAT'},
{'name': 'C', 'type': 'STRING'},
{'name': 'D', 'type': 'TIMESTAMP'}]}
test_schema = {'fields': [
{'name': 'A', 'type': 'FLOAT', 'mode': 'NULLABLE'},
{'name': 'B', 'type': 'FLOAT', 'mode': 'NULLABLE'},
{'name': 'C', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'D', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}]}

self.assertEqual(schema, test_schema)

Expand Down Expand Up @@ -721,6 +761,23 @@ def test_list_table(self):
'Expected table list to contain table {0}'
.format(destination_table))

def test_table_with_required_columns_shouldnt_append(self):
destination_table = TABLE_ID + "10"
test_size = 10
df = make_mixed_dataframe_v2(test_size)
schema = gbq._generate_bq_schema(df)
schema['fields'][0]['mode'] = 'REQUIRED'

# initialize table with same schema as the
# dataframe but with one REQUIRED column
self.table.create(destination_table, schema)

# Try appending to a table with similar schema
# but with REQUIRED columns, confirm failure
with tm.assertRaises(gbq.InvalidSchema):
gbq.to_gbq(df, DATASET_ID + "1." + destination_table,
PROJECT_ID, chunksize=1000, if_exists='append')

def test_list_dataset(self):
dataset_id = DATASET_ID + "1"
self.assertTrue(dataset_id in self.dataset.datasets(),
Expand Down Expand Up @@ -824,7 +881,6 @@ def test_upload_data_as_service_account_with_key_path(self):
"SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
project_id=PROJECT_ID,
private_key=PRIVATE_KEY_JSON_PATH)

self.assertEqual(result['NUM_ROWS'][0], test_size)


Expand Down Expand Up @@ -885,6 +941,7 @@ def test_upload_data_as_service_account_with_key_contents(self):
private_key=PRIVATE_KEY_JSON_CONTENTS)
self.assertEqual(result['NUM_ROWS'][0], test_size)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)