pandas-dev · mroeschke · Dec 28, 2018 · Jun 8, 2018 · Jun 9, 2018 · Jun 10, 2018
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -16,6 +16,7 @@ Other Enhancements
 - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
 - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
 - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`)
+- :func:`~pandas.DataFrame.to_sql` now supports the ``method`` parameter to control the SQL insertion clause used (:issue:`8953`)
 
 
 .. _whatsnew_0240.api_breaking:

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2014,7 +2014,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
  **kwargs)
 
  def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
- index_label=None, chunksize=None, dtype=None):
+ index_label=None, chunksize=None, dtype=None, method='default'):
  """
  Write records stored in a DataFrame to a SQL database.
 
@@ -2052,6 +2052,8 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
  Specifying the datatype for columns. The keys should be the column
  names and the values should be the SQLAlchemy types or strings for
  the sqlite3 legacy mode.
+ method : {'default', 'multi', callable}, default 'default'
+ Controls the SQL insertion clause used.
 
  Raises
  ------
@@ -2120,11 +2122,59 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
 
  >>> engine.execute("SELECT * FROM integers").fetchall()
  [(1,), (None,), (2,)]
+
+ Insertion method:
+
+ .. versionadded:: 0.24.0
+
+ The parameter ``method`` controls the SQL insertion clause used.
+ Possible values are:
+
+ - ``'default'``: Uses standard SQL ``INSERT`` clause
+ - ``'multi'``: Pass multiple values in a single ``INSERT`` clause.
+ It uses a **special** SQL syntax not supported by all backends.
+ This usually provides better performance for analytic databases
+ like *Presto* and *Redshift*, but has worse performance for
+ traditional SQL backends if the table contains many columns.
+ For more information check SQLAlchemy `documentation <http://docs.sqlalchemy.org/en/latest/core/dml.html?highlight=multivalues#sqlalchemy.sql.expression.Insert.values.params.*args>`__.
+ - callable: with signature ``(pd_table, conn, keys, data_iter)``.
+ This can be used to implement more performant insertion based on
+ specific backend dialect features.
+ E.g. using the *PostgreSQL* `COPY clause
+ <https://www.postgresql.org/docs/current/static/sql-copy.html>`__.
+ See the example below for a sample implementation of such
+ a callable.
+
+
+ Example of callable for Postgresql *COPY*::
+
+ # Alternative to_sql() *method* for DBs that support COPY FROM
+ import csv
+ from io import StringIO
+
+ def psql_insert_copy(table, conn, keys, data_iter):
+ # gets a DBAPI connection that can provide a cursor
+ dbapi_conn = conn.connection
+ with dbapi_conn.cursor() as cur:
+ s_buf = StringIO()
+ writer = csv.writer(s_buf)
+ writer.writerows(data_iter)
+ s_buf.seek(0)
+
+ columns = ', '.join('"{}"'.format(k) for k in keys)
+ if table.schema:
+ table_name = '{}.{}'.format(table.schema, table.name)
+ else:
+ table_name = table.name
+
+ sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
+ table_name, columns)
+ cur.copy_expert(sql=sql, file=s_buf)
  """
  from pandas.io import sql
  sql.to_sql(self, name, con, schema=schema, if_exists=if_exists,
  index=index, index_label=index_label, chunksize=chunksize,
- dtype=dtype)
+ dtype=dtype, method=method)
 
  def to_pickle(self, path, compression='infer',
  protocol=pkl.HIGHEST_PROTOCOL):

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -6,6 +6,7 @@
 
 from __future__ import print_function, division
 from datetime import datetime, date, time
+from functools import partial
 
 import warnings
 import re
@@ -398,7 +399,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None,
 
 
 def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
- index_label=None, chunksize=None, dtype=None):
+ index_label=None, chunksize=None, dtype=None, method='default'):
  """
  Write records stored in a DataFrame to a SQL database.
 
@@ -432,6 +433,8 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
  Optional specifying the datatype for columns. The SQL type should
  be a SQLAlchemy type, or a string for sqlite3 fallback connection.
  If all columns are of the same type, one single value can be used.
+ method : {'default', 'multi', callable}, default 'default'
+ Controls the SQL insertion clause used.
 
  """
  if if_exists not in ('fail', 'replace', 'append'):
@@ -447,7 +450,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
 
  pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
  index_label=index_label, schema=schema,
- chunksize=chunksize, dtype=dtype)
+ chunksize=chunksize, dtype=dtype, method=method)
 
 
 def has_table(table_name, con, schema=None):
@@ -572,8 +575,29 @@ def create(self):
  else:
  self._execute_create()
 
- def insert_statement(self):
- return self.table.insert()
+ def _execute_insert(self, conn, keys, data_iter):
+ """Execute SQL statement inserting data
+
+ Parameters
+ ----------
+ conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
+ keys : list of str
+ Column names
+ data_iter : iterable of sequences
+ Each item holds the values of one row to be inserted
+ """
+ data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
+ conn.execute(self.table.insert(), data)
+
+ def _execute_insert_multi(self, conn, keys, data_iter):
+ """Alternative to _execute_insert using a multi-value INSERT.
+
+ Note: only for DBs that support it. It is usually faster for
+ analytics DBs and tables with few columns, but performance
+ degrades quickly as the number of columns increases.
+ """
+ data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
+ conn.execute(self.table.insert(data))
 
  def insert_data(self):
  if self.index is not None:
@@ -611,11 +635,18 @@ def insert_data(self):
 
  return column_names, data_list
 
- def _execute_insert(self, conn, keys, data_iter):
- data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
- conn.execute(self.insert_statement(), data)
+ def insert(self, chunksize=None, method=None):
+
+ # set insert method
+ if method in (None, 'default'):
+ exec_insert = self._execute_insert
+ elif method == 'multi':
+ exec_insert = self._execute_insert_multi
+ elif callable(method):
+ exec_insert = partial(method, self)
+ else:
+ raise ValueError('Invalid parameter `method`: {}'.format(method))
 
- def insert(self, chunksize=None):
  keys, data_list = self.insert_data()
 
  nrows = len(self.frame)
@@ -638,7 +669,7 @@ def insert(self, chunksize=None):
  break
 
  chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
- self._execute_insert(conn, keys, chunk_iter)
+ exec_insert(conn, keys, chunk_iter)
 
  def _query_iterator(self, result, chunksize, columns, coerce_float=True,
  parse_dates=None):
@@ -1078,7 +1109,8 @@ def read_query(self, sql, index_col=None, coerce_float=True,
  read_sql = read_query
 
  def to_sql(self, frame, name, if_exists='fail', index=True,
- index_label=None, schema=None, chunksize=None, dtype=None):
+ index_label=None, schema=None, chunksize=None, dtype=None,
+ method='default'):
  """
  Write records stored in a DataFrame to a SQL database.
 
@@ -1108,7 +1140,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
  Optional specifying the datatype for columns. The SQL type should
  be a SQLAlchemy type. If all columns are of the same type, one
  single value can be used.
-
+ method : {'default', 'multi', callable}, default 'default'
+ Controls the SQL insertion clause used.
  """
  if dtype and not is_dict_like(dtype):
  dtype = {col_name: dtype for col_name in frame}
@@ -1124,7 +1157,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
  if_exists=if_exists, index_label=index_label,
  schema=schema, dtype=dtype)
  table.create()
- table.insert(chunksize)
+ table.insert(chunksize, method=method)
  if (not name.isdigit() and not name.islower()):
  # check for potentially case sensitivity issues (GH7815)
  # Only check when name is not a number and name is not lower case
@@ -1434,7 +1467,8 @@ def _fetchall_as_list(self, cur):
  return result
 
  def to_sql(self, frame, name, if_exists='fail', index=True,
- index_label=None, schema=None, chunksize=None, dtype=None):
+ index_label=None, schema=None, chunksize=None, dtype=None,
+ method='default'):
  """
  Write records stored in a DataFrame to a SQL database.
 
@@ -1463,7 +1497,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
  Optional specifying the datatype for columns. The SQL type should
  be a string. If all columns are of the same type, one single value
  can be used.
-
+ method : {'default', 'multi', callable}, default 'default'
+ Controls the SQL insertion clause used.
  """
  if dtype and not is_dict_like(dtype):
  dtype = {col_name: dtype for col_name in frame}
@@ -1478,7 +1513,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
  if_exists=if_exists, index_label=index_label,
  dtype=dtype)
  table.create()
- table.insert(chunksize)
+ table.insert(chunksize, method)
 
  def has_table(self, name, schema=None):
  # TODO(wesm): unused?

diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -372,12 +372,16 @@ def _read_sql_iris_named_parameter(self):
  iris_frame = self.pandasSQL.read_query(query, params=params)
  self._check_iris_loaded_frame(iris_frame)
 
- def _to_sql(self):
+ def _to_sql(self, method=None):
  self.drop_table('test_frame1')
 
- self.pandasSQL.to_sql(self.test_frame1, 'test_frame1')
+ self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=method)
  assert self.pandasSQL.has_table('test_frame1')
 
+ num_entries = len(self.test_frame1)
+ num_rows = self._count_rows('test_frame1')
+ assert num_rows == num_entries
+
  # Nuke table
  self.drop_table('test_frame1')
 
@@ -431,6 +435,25 @@ def _to_sql_append(self):
  assert num_rows == num_entries
  self.drop_table('test_frame1')
 
+ def _to_sql_method_callable(self):
+ check = [] # used to double check function below is really being used
+
+ def sample(pd_table, conn, keys, data_iter):
+ check.append(1)
+ data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
+ conn.execute(pd_table.table.insert(), data)
+ self.drop_table('test_frame1')
+
+ self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=sample)
+ assert self.pandasSQL.has_table('test_frame1')
+
+ assert check == [1]
+ num_entries = len(self.test_frame1)
+ num_rows = self._count_rows('test_frame1')
+ assert num_rows == num_entries
+ # Nuke table
+ self.drop_table('test_frame1')
+
  def _roundtrip(self):
  self.drop_table('test_frame_roundtrip')
  self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip')
@@ -1180,7 +1203,7 @@ def setup_connect(self):
  pytest.skip(
  "Can't connect to {0} server".format(self.flavor))
 
- def test_aread_sql(self):
+ def test_read_sql(self):
  self._read_sql_iris()
 
  def test_read_sql_parameter(self):
@@ -1204,6 +1227,12 @@ def test_to_sql_replace(self):
  def test_to_sql_append(self):
  self._to_sql_append()
 
+ def test_to_sql_method_multi(self):
+ self._to_sql(method='multi')
+
+ def test_to_sql_method_callable(self):
+ self._to_sql_method_callable()
+
  def test_create_table(self):
  temp_conn = self.connect()
  temp_frame = DataFrame(