The code you are using is very inefficient because it commits each row individually. That is what you want for a transactional DB or process, but not for a one-off bulk dump.
There are a number of ways to speed this up, ranging from great to not so great. Here are four approaches, including the naive implementation (your code above):
```python
#!/usr/bin/env python
import pandas as pd
import numpy as np
import odo
import profilehooks
import sqlalchemy
import csv
import os


def create_test_data():
    n = 100000
    df = pd.DataFrame(dict(
        id=np.random.randint(0, 1000000, n),
        col1=np.random.choice(
            ['hello', 'world', 'python', 'large string for testing ' * 10], n),
        col2=np.random.randint(-1000000, 1000000, n),
        col3=np.random.randint(-9000000, 9000000, n),
        col4=(np.random.random(n) - 0.5) * 99999
    ), columns=['id', 'col1', 'col2', 'col3', 'col4'])
    df.to_csv('tmp.csv', index=False)


@profilehooks.timecall
def using_pandas(table_name, uri):
    df = pd.read_csv('tmp.csv')
    df.to_sql(table_name, con=uri, if_exists='append', index=False)


@profilehooks.timecall
def using_odo(table_name, uri):
    odo.odo('tmp.csv', '%s::%s' % (uri, table_name))


@profilehooks.timecall
def using_cursor(table_name, uri):
    engine = sqlalchemy.create_engine(uri)
    query = 'INSERT INTO {} (id, col1, col2, col3, col4) VALUES (%s, %s, %s, %s, %s)'
    query = query.format(table_name)
    con = engine.raw_connection()
    with con.cursor() as cursor:
        with open('tmp.csv') as fh:
            reader = csv.reader(fh)
            next(reader)  # Skip first line (headers)
            for row in reader:
                cursor.execute(query, row)
                con.commit()  # Committing per row is what makes this so slow
    con.close()


@profilehooks.timecall
def using_cursor_correct(table_name, uri):
    engine = sqlalchemy.create_engine(uri)
    query = 'INSERT INTO {} (id, col1, col2, col3, col4) VALUES (%s, %s, %s, %s, %s)'
    query = query.format(table_name)
    with open('tmp.csv') as fh:
        reader = csv.reader(fh)
        next(reader)  # Skip first line (headers)
        data = list(reader)
    # Passing a list of rows makes the driver use executemany (SQLAlchemy 1.x API)
    engine.execute(query, data)


def main():
    uri = 'mysql+pymysql://root:%s@localhost/test' % os.environ['pass']
    engine = sqlalchemy.create_engine(uri)
    for i in (1, 2, 3, 4):
        engine.execute("DROP TABLE IF EXISTS table%s" % i)
        engine.execute("""
            CREATE TABLE table%s(
                id INT,
                col1 VARCHAR(255),
                col2 INT,
                col3 INT,
                col4 DOUBLE
            );
        """ % i)

    create_test_data()
    using_odo('table1', uri)
    using_pandas('table4', uri)
    using_cursor_correct('table3', uri)
    using_cursor('table2', uri)

    for i in (1, 2, 3, 4):
        count = pd.read_sql('SELECT COUNT(*) as c FROM table%s' % i, con=uri)['c'][0]
        print("Count for table%s - %s" % (i, count))


if __name__ == '__main__':
    main()
```
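A note on `using_cursor_correct`: passing a list of rows to `engine.execute` makes the driver run a single `executemany` instead of one round-trip per row. `Engine.execute` was removed in SQLAlchemy 2.0, so on recent versions the same trick goes through a raw cursor. A minimal sketch, assuming the same `tmp.csv` and table layout as above:

```python
# Sketch: the same bulk insert with an explicit executemany,
# for SQLAlchemy versions where Engine.execute() no longer exists.
import csv
import sqlalchemy

def using_executemany(table_name, uri):
    engine = sqlalchemy.create_engine(uri)
    query = ('INSERT INTO {} (id, col1, col2, col3, col4) '
             'VALUES (%s, %s, %s, %s, %s)').format(table_name)
    with open('tmp.csv') as fh:
        reader = csv.reader(fh)
        next(reader)  # Skip first line (headers)
        data = list(reader)
    con = engine.raw_connection()
    try:
        with con.cursor() as cursor:
            # One executemany call and one commit - not one commit per row
            cursor.executemany(query, data)
        con.commit()
    finally:
        con.close()
```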
1. The odo method is the fastest (it uses MySQL's `LOAD DATA INFILE` under the hood).
2. Next is pandas (its critical code paths are optimized; see also the `to_sql` sketch below).
3. Next is using a raw cursor, but inserting the rows in bulk.
4. Last is the naive method, committing one row at a time.
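If you stay on the pandas route, note that newer pandas versions (0.24+) can also batch rows into multi-row INSERT statements via the `method` parameter of `to_sql`. A sketch, reusing the `uri` from the script above:

```python
import pandas as pd

df = pd.read_csv('tmp.csv')
# method='multi' packs many rows into each INSERT statement;
# chunksize keeps each individual statement a manageable size.
df.to_sql('table4', con=uri, if_exists='append', index=False,
          method='multi', chunksize=1000)
```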
Here are some example timings, run against a local MySQL server:
```
using_odo (./test.py:29): 0.516 seconds
using_pandas (./test.py:23): 3.039 seconds
using_cursor_correct (./test.py:50): 12.847 seconds
using_cursor (./test.py:34): 43.470 seconds
Count for table1 - 100000
Count for table2 - 100000
Count for table3 - 100000
Count for table4 - 100000
```
As you can see, the naive implementation is roughly 85 times slower than odo and about 14 times slower than pandas. odo wins because it hands the work to `LOAD DATA INFILE`, which is optimized for bulk loading of data in MySQL.
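If you want odo's speed without the odo dependency, you can issue the statement yourself. A minimal sketch, assuming `local_infile` is enabled on both the MySQL server and the pymysql client (the `connect_args` below handles the client side) and the same `tmp.csv` layout:

```python
# Sketch: running LOAD DATA LOCAL INFILE directly through pymysql.
import sqlalchemy

engine = sqlalchemy.create_engine(uri, connect_args={'local_infile': 1})
con = engine.raw_connection()
try:
    with con.cursor() as cursor:
        cursor.execute("""
            LOAD DATA LOCAL INFILE 'tmp.csv'
            INTO TABLE table1
            FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
            LINES TERMINATED BY '\\n'
            IGNORE 1 LINES
            (id, col1, col2, col3, col4)
        """)
    con.commit()
finally:
    con.close()
```

The `LOCAL` keyword makes the client stream the file to the server, so this also works when the CSV does not live on the database host.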