-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
ENH: to_sql() add parameter "method" to control insertions method (#8… #21401
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
a77cdfd 21e8c04 1e5d1cc 085313c 236e2c0 f4ffbfc b49792b 59cbdf7 7bdf6f3 44f321a d5ccabf 643e9bf 6fc6a26 87730f3 f36710b 19f9dfa c0bf457 19ce379 File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -2014,7 +2014,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): | |
| **kwargs) | ||
| | ||
| def to_sql(self, name, con, schema=None, if_exists='fail', index=True, | ||
| index_label=None, chunksize=None, dtype=None): | ||
| index_label=None, chunksize=None, dtype=None, method='default'): | ||
| """ | ||
| Write records stored in a DataFrame to a SQL database. | ||
| | ||
| | @@ -2052,6 +2052,8 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True, | |
| Specifying the datatype for columns. The keys should be the column | ||
| names and the values should be the SQLAlchemy types or strings for | ||
| the sqlite3 legacy mode. | ||
| method : {'default', 'multi', callable}, default 'default' | ||
| Controls the SQL insertion clause used. | ||
| | ||
| Raises | ||
| ------ | ||
| | @@ -2120,11 +2122,59 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True, | |
| | ||
| >>> engine.execute("SELECT * FROM integers").fetchall() | ||
| [(1,), (None,), (2,)] | ||
| | ||
| Insertion method: | ||
| | ||
| .. versionadded:: 0.24.0 | ||
| | ||
| The parameter ``method`` controls the SQL insertion clause used. | ||
| Possible values are: | ||
| | ||
| - `'default'`: Uses standard SQL `INSERT` clause | ||
| ||
| - `'multi'`: Pass multiple values in a single `INSERT` clause. | ||
| It uses a **special** SQL syntax not supported by all backends. | ||
| ||
| This usually provides a big performance for Analytic databases | ||
| like *Presto* and *Redshit*, but has worse performance for | ||
| traditional SQL backend if the table contains many columns. | ||
| For more information check SQLAlchemy `documention <http://docs.sqlalchemy.org/en/latest/core/dml.html?highlight=multivalues#sqlalchemy.sql.expression.Insert.values.params.*args>`__. | ||
| ||
| - callable: with signature `(pd_table, conn, keys, data_iter)`. | ||
| This can be used to implement more performant insertion based on | ||
| ||
| specific backend dialect features. | ||
| I.e. using *Postgresql* `COPY clause | ||
| <https://www.postgresql.org/docs/current/static/sql-copy.html>`__. | ||
| Check API for details and a sample implementation | ||
| :func:`~pandas.DataFrame.to_sql`. | ||
| | ||
| | ||
| Example of callable for Postgresql *COPY*:: | ||
| | ||
| # Alternative to_sql() *method* for DBs that support COPY FROM | ||
| import csv | ||
| from io import StringIO | ||
| | ||
| def psql_insert_copy(table, conn, keys, data_iter): | ||
| # gets a DBAPI connection that can provide a cursor | ||
| dbapi_conn = conn.connection | ||
| with dbapi_conn.cursor() as cur: | ||
| s_buf = StringIO() | ||
| writer = csv.writer(s_buf) | ||
| writer.writerows(data_iter) | ||
| s_buf.seek(0) | ||
| | ||
| columns = ', '.join('"{}"'.format(k) for k in keys) | ||
| if table.schema: | ||
| table_name = '{}.{}'.format(table.schema, table.name) | ||
| else: | ||
| table_name = table.name | ||
| | ||
| sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format( | ||
| table_name, columns) | ||
| cur.copy_expert(sql=sql, file=s_buf) | ||
| ||
| """ | ||
| from pandas.io import sql | ||
| sql.to_sql(self, name, con, schema=schema, if_exists=if_exists, | ||
| index=index, index_label=index_label, chunksize=chunksize, | ||
| dtype=dtype) | ||
| dtype=dtype, method=method) | ||
| | ||
| def to_pickle(self, path, compression='infer', | ||
| protocol=pkl.HIGHEST_PROTOCOL): | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -6,6 +6,7 @@ | |
| | ||
| from __future__ import print_function, division | ||
| from datetime import datetime, date, time | ||
| from functools import partial | ||
| | ||
| import warnings | ||
| import re | ||
| | @@ -398,7 +399,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, | |
| | ||
| | ||
| def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, | ||
| index_label=None, chunksize=None, dtype=None): | ||
| index_label=None, chunksize=None, dtype=None, method='default'): | ||
jreback marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| """ | ||
| Write records stored in a DataFrame to a SQL database. | ||
| | ||
| | @@ -432,6 +433,8 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, | |
| Optional specifying the datatype for columns. The SQL type should | ||
| be a SQLAlchemy type, or a string for sqlite3 fallback connection. | ||
| If all columns are of the same type, one single value can be used. | ||
| method : {'default', 'multi', callable}, default 'default' | ||
| Controls the SQL insertion clause used. | ||
| | ||
| """ | ||
| if if_exists not in ('fail', 'replace', 'append'): | ||
| | @@ -447,7 +450,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, | |
| | ||
| pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, | ||
| index_label=index_label, schema=schema, | ||
| chunksize=chunksize, dtype=dtype) | ||
| chunksize=chunksize, dtype=dtype, method=method) | ||
| | ||
| | ||
| def has_table(table_name, con, schema=None): | ||
| | @@ -572,8 +575,29 @@ def create(self): | |
| else: | ||
| self._execute_create() | ||
| | ||
| def insert_statement(self): | ||
| return self.table.insert() | ||
| def _execute_insert(self, conn, keys, data_iter): | ||
| """Execute SQL statement inserting data | ||
| | ||
| Parameters | ||
| ---------- | ||
| conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection | ||
| keys : list of str | ||
| Column names | ||
| data_iter : generator of list | ||
| Each item contains a list of values to be inserted | ||
| """ | ||
| data = [{k: v for k, v in zip(keys, row)} for row in data_iter] | ||
| conn.execute(self.table.insert(), data) | ||
| | ||
| def _execute_insert_multi(self, conn, keys, data_iter): | ||
| """Alternative to _execute_insert for DBs support multivalue INSERT. | ||
| | ||
| Note: multi-value insert is usually faster for analytics DBs | ||
| and tables containing a few columns | ||
| but performance degrades quickly with increase of columns. | ||
| """ | ||
| data = [{k: v for k, v in zip(keys, row)} for row in data_iter] | ||
| conn.execute(self.table.insert(data)) | ||
| | ||
| def insert_data(self): | ||
| if self.index is not None: | ||
| | @@ -611,11 +635,18 @@ def insert_data(self): | |
| | ||
| return column_names, data_list | ||
| | ||
| def _execute_insert(self, conn, keys, data_iter): | ||
| data = [{k: v for k, v in zip(keys, row)} for row in data_iter] | ||
| conn.execute(self.insert_statement(), data) | ||
| def insert(self, chunksize=None, method=None): | ||
| | ||
| # set insert method | ||
| if method in (None, 'default'): | ||
| ||
| exec_insert = self._execute_insert | ||
| elif method == 'multi': | ||
| exec_insert = self._execute_insert_multi | ||
| elif callable(method): | ||
| exec_insert = partial(method, self) | ||
| else: | ||
| raise ValueError('Invalid parameter `method`: {}'.format(method)) | ||
| | ||
| def insert(self, chunksize=None): | ||
| keys, data_list = self.insert_data() | ||
| | ||
| nrows = len(self.frame) | ||
| | @@ -638,7 +669,7 @@ def insert(self, chunksize=None): | |
| break | ||
| | ||
| chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list]) | ||
| self._execute_insert(conn, keys, chunk_iter) | ||
| exec_insert(conn, keys, chunk_iter) | ||
| | ||
| def _query_iterator(self, result, chunksize, columns, coerce_float=True, | ||
| parse_dates=None): | ||
| | @@ -1078,7 +1109,8 @@ def read_query(self, sql, index_col=None, coerce_float=True, | |
| read_sql = read_query | ||
| | ||
| def to_sql(self, frame, name, if_exists='fail', index=True, | ||
| index_label=None, schema=None, chunksize=None, dtype=None): | ||
| index_label=None, schema=None, chunksize=None, dtype=None, | ||
| method='default'): | ||
| """ | ||
| Write records stored in a DataFrame to a SQL database. | ||
| | ||
| | @@ -1108,7 +1140,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True, | |
| Optional specifying the datatype for columns. The SQL type should | ||
| be a SQLAlchemy type. If all columns are of the same type, one | ||
| single value can be used. | ||
| | ||
| method : {'default', 'multi', callable}, default 'default' | ||
| Controls the SQL insertion clause used. | ||
| """ | ||
| if dtype and not is_dict_like(dtype): | ||
| dtype = {col_name: dtype for col_name in frame} | ||
| | @@ -1124,7 +1157,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True, | |
| if_exists=if_exists, index_label=index_label, | ||
| schema=schema, dtype=dtype) | ||
| table.create() | ||
| table.insert(chunksize) | ||
| table.insert(chunksize, method=method) | ||
| if (not name.isdigit() and not name.islower()): | ||
| # check for potentially case sensitivity issues (GH7815) | ||
| # Only check when name is not a number and name is not lower case | ||
| | @@ -1434,7 +1467,8 @@ def _fetchall_as_list(self, cur): | |
| return result | ||
| | ||
| def to_sql(self, frame, name, if_exists='fail', index=True, | ||
| index_label=None, schema=None, chunksize=None, dtype=None): | ||
| index_label=None, schema=None, chunksize=None, dtype=None, | ||
| method='default'): | ||
| """ | ||
| Write records stored in a DataFrame to a SQL database. | ||
| | ||
| | @@ -1463,7 +1497,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True, | |
| Optional specifying the datatype for columns. The SQL type should | ||
| be a string. If all columns are of the same type, one single value | ||
| can be used. | ||
| | ||
| method : {'default', 'multi', callable}, default 'default' | ||
| Controls the SQL insertion clause used. | ||
| """ | ||
| if dtype and not is_dict_like(dtype): | ||
| dtype = {col_name: dtype for col_name in frame} | ||
| | @@ -1478,7 +1513,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True, | |
| if_exists=if_exists, index_label=index_label, | ||
| dtype=dtype) | ||
| table.create() | ||
| table.insert(chunksize) | ||
| table.insert(chunksize, method) | ||
| | ||
| def has_table(self, name, schema=None): | ||
| # TODO(wesm): unused? | ||
| | ||
Uh oh!
There was an error while loading. Please reload this page.