@@ -5576,7 +5576,7 @@ Performance considerations
55765576--------------------------
55775577
55785578This is an informal comparison of various IO methods, using pandas
5579- 0.20.3 . Timings are machine dependent and small differences should be
5579+ 0.24.2. Timings are machine dependent and small differences should be
55805580ignored.
55815581
55825582.. code-block :: ipython
@@ -5597,11 +5597,18 @@ Given the next test set:
55975597
55985598.. code-block :: python
55995599
5600+
5601+
5602+ import numpy as np
5603+
56005604 import os
56015605
56025606 sz = 1000000
56035607 df = pd.DataFrame({' A' : np.random.randn(sz), ' B' : [1 ] * sz})
56045608
5609+ sz = 1000000
5610+ np.random.seed(42 )
5611+ df = pd.DataFrame({' A' : np.random.randn(sz), ' B' : [1 ] * sz})
56055612
56065613 def test_sql_write (df ):
56075614 if os.path.exists(' test.sql' ):
@@ -5610,151 +5617,152 @@ Given the next test set:
56105617 df.to_sql(name = ' test_table' , con = sql_db)
56115618 sql_db.close()
56125619
5613-
56145620 def test_sql_read ():
56155621 sql_db = sqlite3.connect(' test.sql' )
56165622 pd.read_sql_query(" select * from test_table" , sql_db)
56175623 sql_db.close()
56185624
5619-
56205625 def test_hdf_fixed_write (df ):
56215626 df.to_hdf(' test_fixed.hdf' , ' test' , mode = ' w' )
56225627
5623-
56245628 def test_hdf_fixed_read ():
56255629 pd.read_hdf(' test_fixed.hdf' , ' test' )
56265630
5627-
56285631 def test_hdf_fixed_write_compress (df ):
56295632 df.to_hdf(' test_fixed_compress.hdf' , ' test' , mode = ' w' , complib = ' blosc' )
56305633
5631-
56325634 def test_hdf_fixed_read_compress ():
56335635 pd.read_hdf(' test_fixed_compress.hdf' , ' test' )
56345636
5635-
56365637 def test_hdf_table_write (df ):
56375638 df.to_hdf(' test_table.hdf' , ' test' , mode = ' w' , format = ' table' )
56385639
5639-
56405640 def test_hdf_table_read ():
56415641 pd.read_hdf(' test_table.hdf' , ' test' )
56425642
5643-
56445643 def test_hdf_table_write_compress (df ):
56455644 df.to_hdf(' test_table_compress.hdf' , ' test' , mode = ' w' ,
56465645 complib = ' blosc' , format = ' table' )
56475646
5648-
56495647 def test_hdf_table_read_compress ():
56505648 pd.read_hdf(' test_table_compress.hdf' , ' test' )
56515649
5652-
56535650 def test_csv_write (df ):
56545651 df.to_csv(' test.csv' , mode = ' w' )
56555652
5656-
56575653 def test_csv_read ():
56585654 pd.read_csv(' test.csv' , index_col = 0 )
56595655
5660-
56615656 def test_feather_write (df ):
56625657 df.to_feather(' test.feather' )
56635658
5664-
56655659 def test_feather_read ():
56665660 pd.read_feather(' test.feather' )
56675661
5668-
56695662 def test_pickle_write (df ):
56705663 df.to_pickle(' test.pkl' )
56715664
5672-
56735665 def test_pickle_read ():
56745666 pd.read_pickle(' test.pkl' )
56755667
5676-
56775668 def test_pickle_write_compress (df ):
56785669 df.to_pickle(' test.pkl.compress' , compression = ' xz' )
56795670
5680-
56815671 def test_pickle_read_compress ():
56825672 pd.read_pickle(' test.pkl.compress' , compression = ' xz' )
56835673
5684- When writing, the top-three functions in terms of speed are are
5685- ``test_pickle_write ``, ``test_feather_write `` and ``test_hdf_fixed_write_compress ``.
5674+ def test_parquet_write (df ):
5675+ df.to_parquet(' test.parquet' )
5676+
5677+ def test_parquet_read ():
5678+ pd.read_parquet(' test.parquet' )
5679+
5680+ When writing, the top three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``.
56865681
56875682.. code-block :: ipython
56885683
5689- In [14 ]: %timeit test_sql_write(df)
5690- 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5684+ In [4 ]: %timeit test_sql_write(df)
5685+ 3.29 s ± 43.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
56915686
5692- In [15 ]: %timeit test_hdf_fixed_write(df)
5693- 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5687+ In [5 ]: %timeit test_hdf_fixed_write(df)
5688+ 19.4 ms ± 560 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
56945689
5695- In [26 ]: %timeit test_hdf_fixed_write_compress(df)
5696- 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5690+ In [6 ]: %timeit test_hdf_fixed_write_compress(df)
5691+ 19.6 ms ± 308 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
56975692
5698- In [16 ]: %timeit test_hdf_table_write(df)
5699- 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5693+ In [7 ]: %timeit test_hdf_table_write(df)
5694+ 449 ms ± 5.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57005695
5701- In [27 ]: %timeit test_hdf_table_write_compress(df)
5702- 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5696+ In [8 ]: %timeit test_hdf_table_write_compress(df)
5697+ 448 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57035698
5704- In [17 ]: %timeit test_csv_write(df)
5705- 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5699+ In [9 ]: %timeit test_csv_write(df)
5700+ 3.66 s ± 26.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57065701
5707- In [30 ]: %timeit test_feather_write(df)
5708- 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5702+ In [10 ]: %timeit test_feather_write(df)
5703+ 9.75 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
57095704
5710- In [31 ]: %timeit test_pickle_write(df)
5711- 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5705+ In [11 ]: %timeit test_pickle_write(df)
5706+ 30.1 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57125707
5713- In [32]: %timeit test_pickle_write_compress(df)
5714- 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5708+ In [12]: %timeit test_pickle_write_compress(df)
5709+ 4.29 s ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5710+
5711+ In [13]: %timeit test_parquet_write(df)
5712+ 67.6 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57155713
57165714 When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and
57175715``test_hdf_fixed_read``.
57185716
5717+
57195718.. code-block :: ipython
57205719
5721- In [18]: %timeit test_sql_read()
5722- 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5720+ In [14]: %timeit test_sql_read()
5721+ 1.77 s ± 17.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5722+
5723+ In [15]: %timeit test_hdf_fixed_read()
5724+ 19.4 ms ± 436 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5725+
5726+ In [16]: %timeit test_hdf_fixed_read_compress()
5727+ 19.5 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57235728
5724- In [19 ]: %timeit test_hdf_fixed_read ()
5725- 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5729+ In [17 ]: %timeit test_hdf_table_read ()
5730+ 38.6 ms ± 857 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57265731
5727- In [28 ]: %timeit test_hdf_fixed_read_compress ()
5728- 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5732+ In [18 ]: %timeit test_hdf_table_read_compress ()
5733+ 38.8 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
57295734
5730- In [20 ]: %timeit test_hdf_table_read ()
5731- 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
5735+ In [19 ]: %timeit test_csv_read ()
5736+ 452 ms ± 9.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57325737
5733- In [29 ]: %timeit test_hdf_table_read_compress ()
5734- 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
5738+ In [20 ]: %timeit test_feather_read ()
5739+ 12.4 ms ± 99.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
57355740
5736- In [22 ]: %timeit test_csv_read ()
5737- 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
5741+ In [21 ]: %timeit test_pickle_read ()
5742+ 18.4 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
57385743
5739- In [33 ]: %timeit test_feather_read ()
5740- 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5744+ In [22 ]: %timeit test_pickle_read_compress ()
5745+ 915 ms ± 7.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57415746
5742- In [34 ]: %timeit test_pickle_read ()
5743- 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5747+ In [23 ]: %timeit test_parquet_read ()
5748+ 24.4 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
57445749
5745- In [35]: %timeit test_pickle_read_compress()
5746- 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
57475750
5751+ For this test case, ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk.
57485752Space on disk (in bytes):
57495753
57505754.. code-block :: none
57515755
5752- 34816000 Aug 21 18:00 test.sql
5753- 24009240 Aug 21 18:00 test_fixed.hdf
5754- 7919610 Aug 21 18:00 test_fixed_compress.hdf
5755- 24458892 Aug 21 18:00 test_table.hdf
5756- 8657116 Aug 21 18:00 test_table_compress.hdf
5757- 28520770 Aug 21 18:00 test.csv
5758- 16000248 Aug 21 18:00 test.feather
5759- 16000848 Aug 21 18:00 test.pkl
5760- 7554108 Aug 21 18:00 test.pkl.compress
5756+ 29519500 Oct 10 06:45 test.csv
5757+ 16000248 Oct 10 06:45 test.feather
5758+ 8281983 Oct 10 06:49 test.parquet
5759+ 16000857 Oct 10 06:47 test.pkl
5760+ 7552144 Oct 10 06:48 test.pkl.compress
5761+ 34816000 Oct 10 06:42 test.sql
5762+ 24009288 Oct 10 06:43 test_fixed.hdf
5763+ 24009288 Oct 10 06:43 test_fixed_compress.hdf
5764+ 24458940 Oct 10 06:44 test_table.hdf
5765+ 24458940 Oct 10 06:44 test_table_compress.hdf
5766+
5767+
5768+
0 commit comments