I'm testing the possibility of reading a large CSV file simultaneously with different processes using pandas and the Python multiprocessing module.
There are some time savings, but they're pretty minimal. At first I thought it might be limited by how fast data can be read from the hard drive, but I don't think that's the cause, because reading a large text file or a large Python pickle file is much faster.
Please see the code below:
import multiprocessing as mp
import re
import time
from datetime import datetime

import numpy as np
import pandas as pd

try:
    # Project-local helper; nothing in this script uses it. Imported
    # optionally so the benchmark still runs where ``util`` is unavailable.
    from util import memchk
except ImportError:
    memchk = None

FILE_LENGTH = 1000000
INFILE = 'rtest.1mX80.csv'


def single():
    """Read all of INFILE in the current process and return the DataFrame."""
    return pd.read_csv(INFILE)


def now():
    """Return the current local time as ``H:MM:SS.mmm``.

    Milliseconds are truncated (integer division) rather than rounded, so
    the last field can never overflow to ``1000``.
    """
    current = datetime.now()
    return (
        f"{current.hour}:{current.minute:02}:{current.second:02}"
        f".{current.microsecond // 1000:03d}"
    )


def process_name():
    """Return a short label for the current process.

    Inside a pool worker this is ``PoolWorker-<n>`` (the start-method prefix
    such as ``Fork``/``Spawn`` is dropped). In any other process the plain
    process name is returned instead of raising.
    """
    name = mp.current_process().name
    match = re.search(r'PoolWorker-\d+', name)
    return match.group() if match else name


def read_chunk(skiprows, nrows, infile=None):
    """Read ``nrows`` data rows of a CSV, starting after ``skiprows`` data rows.

    Args:
        skiprows: Number of data rows (not counting the header) to skip.
        nrows: Number of data rows to read.
        infile: Path of the CSV file; defaults to the module-level INFILE.

    Returns:
        A DataFrame with the requested rows and the file's own column names.
    """
    path = INFILE if infile is None else infile
    print(f'({now()} | {process_name()}) Starting to read a chunk...')
    start = time.perf_counter()
    # Skip line indices 1..skiprows and keep line 0, so every chunk gets the
    # real header. An integer ``skiprows`` would also skip the header and
    # promote a data row to column names.
    # NOTE(review): skipped rows presumably still have to be scanned to find
    # their line ends, so later chunks repeat work - confirm by profiling.
    df = pd.read_csv(path, skiprows=range(1, skiprows + 1), nrows=nrows)
    end = time.perf_counter()
    print(f"({now()} | {process_name()}) Read the chunk in {end-start:.2f} seconds!")
    return df


def parallel(n_workers=4):
    """Read INFILE in ``n_workers`` processes and concatenate the chunks.

    Args:
        n_workers: Number of worker processes / chunks (default 4).

    Returns:
        A DataFrame holding all FILE_LENGTH rows with a fresh 0..N-1 index.

    Raises:
        ValueError: If ``n_workers`` is less than 1.
    """
    if n_workers < 1:
        raise ValueError("n_workers must be at least 1")
    # Ceil division so no rows are lost when FILE_LENGTH is not a multiple of
    # n_workers; the final chunk is simply shorter.
    chunk_size = -(-FILE_LENGTH // n_workers)
    starmap_args = [
        (offset, min(chunk_size, FILE_LENGTH - offset))
        for offset in range(0, FILE_LENGTH, chunk_size)
    ]
    with mp.Pool(n_workers) as pool:
        # Each worker's DataFrame is pickled back to the parent process,
        # which adds overhead on top of the parsing itself.
        dfs = pool.starmap(read_chunk, starmap_args)
    return pd.concat(dfs, sort=False, ignore_index=True)


def gen_df(nrows, ncols):
    """Return an ``nrows`` x ``ncols`` DataFrame of uniform random floats.

    Columns are named ``col1`` .. ``col<ncols>``.
    """
    colnames = [f"col{i}" for i in range(1, ncols + 1)]
    return pd.DataFrame(np.random.rand(nrows, ncols), columns=colnames)


def main():
    """Generate the test file, then time the single- and multi-process reads."""
    gen_df(FILE_LENGTH, 80).to_csv(INFILE, index=False)

    start = time.perf_counter()
    df1 = single()
    end = time.perf_counter()
    print(f"Finished reading file (singleprocessing) in {end-start:.2f} seconds.")

    start = time.perf_counter()
    df2 = parallel()
    end = time.perf_counter()
    print(f"Finished reading file (multiprocessing) in {end-start:.2f} seconds.")


if __name__ == "__main__":
    main()

# What is the reason why multiprocessing is only marginally faster when
# reading large files in pandas? Is it some kind of hardware limitation, or is
# it more closely related to the pandas implementation of read_csv?