Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4386,10 +4386,17 @@ a ``TableCreationError`` if the destination table already exists.

.. note::

If the ``if_exists`` argument is set to ``'append'``, the destination dataframe will
be written to the table using the defined table schema and column types. The
dataframe must match the destination table in column order, structure, and
data types.
If the ``if_exists`` argument is set to ``'append'``, the dataframe will be written
to the destination table using the defined table schema and column types. The
dataframe must match the destination table in column order and the following attributes:

* ``name``
* ``type``
* ``mode``

The destination table's columns must all have ``NULLABLE`` modes.


If the ``if_exists`` argument is set to ``'replace'``, and the existing table has a
different schema, a delay of 2 minutes will be forced to ensure that the new schema
has propagated in the Google environment. See
Expand Down Expand Up @@ -4454,12 +4461,12 @@ produce the dictionary representation schema of the specified pandas DataFrame.

In [10]: gbq.generate_bq_schema(df, default_type='STRING')

Out[10]: {'fields': [{'name': 'my_bool1', 'type': 'BOOLEAN'},
{'name': 'my_bool2', 'type': 'BOOLEAN'},
{'name': 'my_dates', 'type': 'TIMESTAMP'},
{'name': 'my_float64', 'type': 'FLOAT'},
{'name': 'my_int64', 'type': 'INTEGER'},
{'name': 'my_string', 'type': 'STRING'}]}
Out[10]: {'fields': [{'name': 'my_bool1', 'type': 'BOOLEAN', 'mode': 'NULLABLE'},
{'name': 'my_bool2', 'type': 'BOOLEAN', 'mode': 'NULLABLE'},
{'name': 'my_dates', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
{'name': 'my_float64', 'type': 'FLOAT', 'mode': 'NULLABLE'},
{'name': 'my_int64', 'type': 'INTEGER', 'mode': 'NULLABLE'},
{'name': 'my_string', 'type': 'STRING', 'mode': 'NULLABLE'}]}

.. note::

Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -377,3 +377,5 @@ Bug Fixes


- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)

- Bug in ``DataFrame.to_gbq()`` where dataframes could not be appended to Google BigQuery tables that had been created in any way other than ``DataFrame.to_gbq()``.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this entry to the 0.19.2 whatsnew file?

19 changes: 15 additions & 4 deletions pandas/io/gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,11 +477,17 @@ def verify_schema(self, dataset_id, table_id, schema):
from apiclient.errors import HttpError

try:
return (self.service.tables().get(
remote_schema = self.service.tables().get(
projectId=self.project_id,
datasetId=dataset_id,
tableId=table_id
).execute()['schema']) == schema
).execute()['schema']

modified_schema_field_list = [
{key: field[key] for key in field if key != 'description'}
Copy link
Contributor

@parthea parthea May 8, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a unit test to confirm that 'description' is ignored when comparing the schema of an existing table?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! You could argue that test_df_with_matching_schema_should_verify_schema confirms that 'description' is ignored, but I see your point.

So I added test_verify_schema_should_ignore_column_description. The idea was to first prove that 'description' was the one and only key in the BigQuery table's schema that doesn't exist in the schema generated from the DataFrame. Then if the generated schema is verified with verify_schema, that proves description is in fact ignored. Feedback welcome.

There is one test that fails but that's because of this commit in testing.assert_index_equal.

for field in remote_schema['fields']]

return {'fields': modified_schema_field_list} == schema

except HttpError as ex:
self.process_http_error(ex)
Expand Down Expand Up @@ -717,7 +723,11 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000,
raise InvalidSchema("Please verify that the column order, "
"structure and data types in the "
"DataFrame match the schema of the "
"destination table.")
"destination table. All columns in the "
"destination table must be NULLABLE. "
"To change REQUIRED columns to NULLABLE, "
"see https://cloud.google.com/bigquery/"
"docs/tables#updateschema")
else:
table.create(table_id, table_schema)

Expand Down Expand Up @@ -756,7 +766,8 @@ def _generate_bq_schema(df, default_type='STRING'):
fields = []
for column_name, dtype in df.dtypes.iteritems():
fields.append({'name': column_name,
'type': type_mapping.get(dtype.kind, default_type)})
'type': type_mapping.get(dtype.kind, default_type),
'mode': 'NULLABLE'})

return {'fields': fields}

Expand Down
83 changes: 70 additions & 13 deletions pandas/io/tests/test_gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@
import pytz
import platform
from time import sleep

import numpy as np

from distutils.version import StrictVersion
from pandas import compat

from pandas import NaT
from pandas.compat import u, range
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -75,10 +72,8 @@ def _test_imports():
try:
from apiclient.discovery import build # noqa
from apiclient.errors import HttpError # noqa

from oauth2client.client import OAuth2WebServerFlow # noqa
from oauth2client.client import AccessTokenRefreshError # noqa

from oauth2client.file import Storage # noqa
from oauth2client.tools import run_flow # noqa
_GOOGLE_API_CLIENT_INSTALLED = True
Expand Down Expand Up @@ -185,7 +180,6 @@ def test_generate_bq_schema_deprecated():


class TestGBQConnectorIntegration(tm.TestCase):

def setUp(self):
test_requirements()

Expand Down Expand Up @@ -279,6 +273,15 @@ def test_should_be_able_to_get_results_from_query(self):
class GBQUnitTests(tm.TestCase):
def setUp(self):
test_requirements()
self.sut_public = gbq.GbqConnector('bigquery-public-data')
self.names_df = DataFrame(
{'state': '', 'gender': '', 'year': [1],
'name': '', 'number': [1]},
columns=['state', 'gender', 'year', 'name', 'number'])
self.shakes_df = DataFrame(
{'word': 'foo', 'word_count': [1],
'corpus': 'bar', 'corpus_date': [1]},
columns=['word', 'word_count', 'corpus', 'corpus_date'])

def test_should_return_bigquery_integers_as_python_floats(self):
result = gbq._parse_entry(1, 'INTEGER')
Expand Down Expand Up @@ -349,9 +352,45 @@ def test_read_gbq_with_corrupted_private_key_json_should_fail(self):
'SELECT 1', project_id='x',
private_key=re.sub('[a-z]', '9', PRIVATE_KEY_JSON_CONTENTS))

def test_df_with_matching_schema_should_verify_schema(self):
schema = gbq._generate_bq_schema(self.names_df)
self.assertTrue(self.sut_public.verify_schema(
'usa_names', 'usa_1910_2013', schema))

def test_bq_table_with_required_fields_should_fail_verify_schema(self):
schema = gbq._generate_bq_schema(self.shakes_df)
self.assertFalse(
self.sut_public.verify_schema('samples', 'shakespeare', schema))

def test_append_to_table_w_required_cols_should_raise_InvalidSchema(self):
with tm.assertRaises(gbq.InvalidSchema):
gbq.to_gbq(self.shakes_df, 'samples.shakespeare',
'bigquery-public-data', if_exists='append')

def test_verify_schema_should_ignore_column_description(self):
bigquery_service = self.sut_public.get_service()

remote_schema = bigquery_service.tables().get(
projectId='bigquery-public-data',
datasetId='usa_names',
tableId='usa_1910_2013'
).execute()['schema']

gen_schema = gbq._generate_bq_schema(self.names_df)

# prove that the schemas are identical except for description
f = lambda x, y: set(x.keys()) - set(y.keys())
key_set_list = map(f, remote_schema['fields'], gen_schema['fields'])
list_of_keys_in_remote_schema_only = list(
reduce(lambda x, y: set.union(x, y), key_set_list))
self.assertEqual(list_of_keys_in_remote_schema_only, ['description'])

self.assertTrue(self.sut_public.verify_schema('usa_names',
'usa_1910_2013',
gen_schema))

class TestReadGBQIntegration(tm.TestCase):

class TestReadGBQIntegration(tm.TestCase):
@classmethod
def setUpClass(cls):
# - GLOBAL CLASS FIXTURES -
Expand Down Expand Up @@ -493,7 +532,7 @@ def test_column_order(self):
result_frame = gbq.read_gbq(
query, project_id=PROJECT_ID, col_order=col_order)
correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': [
'b'], 'STRING_3': ['c']})[col_order]
'b'], 'STRING_3': ['c']})[col_order]
tm.assert_frame_equal(result_frame, correct_frame)

def test_column_order_plus_index(self):
Expand Down Expand Up @@ -677,10 +716,11 @@ def test_generate_schema(self):
df = tm.makeMixedDataFrame()
schema = gbq._generate_bq_schema(df)

test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'},
{'name': 'B', 'type': 'FLOAT'},
{'name': 'C', 'type': 'STRING'},
{'name': 'D', 'type': 'TIMESTAMP'}]}
test_schema = {'fields': [
{'name': 'A', 'type': 'FLOAT', 'mode': 'NULLABLE'},
{'name': 'B', 'type': 'FLOAT', 'mode': 'NULLABLE'},
{'name': 'C', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'D', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}]}

self.assertEqual(schema, test_schema)

Expand Down Expand Up @@ -721,6 +761,23 @@ def test_list_table(self):
'Expected table list to contain table {0}'
.format(destination_table))

def test_table_with_required_columns_shouldnt_append(self):
destination_table = TABLE_ID + "10"
test_size = 10
df = make_mixed_dataframe_v2(test_size)
schema = gbq._generate_bq_schema(df)
schema['fields'][0]['mode'] = 'REQUIRED'

# initialize table with same schema as the
# dataframe but with one REQUIRED column
self.table.create(destination_table, schema)

# Try appending to a table with similar schema
# but with REQUIRED columns, confirm failure
with tm.assertRaises(gbq.InvalidSchema):
gbq.to_gbq(df, DATASET_ID + "1." + destination_table,
PROJECT_ID, chunksize=1000, if_exists='append')

def test_list_dataset(self):
dataset_id = DATASET_ID + "1"
self.assertTrue(dataset_id in self.dataset.datasets(),
Expand Down Expand Up @@ -824,7 +881,6 @@ def test_upload_data_as_service_account_with_key_path(self):
"SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table),
project_id=PROJECT_ID,
private_key=PRIVATE_KEY_JSON_PATH)

self.assertEqual(result['NUM_ROWS'][0], test_size)


Expand Down Expand Up @@ -885,6 +941,7 @@ def test_upload_data_as_service_account_with_key_contents(self):
private_key=PRIVATE_KEY_JSON_CONTENTS)
self.assertEqual(result['NUM_ROWS'][0], test_size)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)