This seems to be the fastest solution I could come up with, and is faster than os.walk and a lot faster than any glob solution.
- It will also give you a list of all nested subfolders at basically no cost.
- You can search for several different extensions.
- You can also choose to return either full paths or just the names for the files by changing
f.path to f.name (do not change it for subfolders!).
Args: dir: str, ext: list.
Function returns two lists: subfolders, files.
See below for a detailed speed analysis.
def run_fast_scandir(dir, ext):    # dir: str, ext: list
    """Recursively scan *dir* using os.scandir.

    Args:
        dir: root directory path (str). (The name shadows the builtin ``dir``
            but is kept for backward compatibility with existing callers.)
        ext: list of lowercase file extensions including the dot, e.g. [".jpg"].

    Returns:
        (subfolders, files): two lists of full paths — every nested subfolder,
        and every file whose lowercased extension is in *ext*.
    """
    subfolders, files = [], []
    for entry in os.scandir(dir):
        if entry.is_dir():
            subfolders.append(entry.path)
        elif entry.is_file():
            # Compare the lowercased extension so ".JPG" matches ".jpg".
            if os.path.splitext(entry.name)[1].lower() in ext:
                files.append(entry.path)
    # Recurse into the direct children only (snapshot via list()); each
    # recursive call returns its entire subtree, which we merge in.
    for subdir in list(subfolders):
        sf, fl = run_fast_scandir(subdir, ext)
        subfolders.extend(sf)
        files.extend(fl)
    return subfolders, files


# Example usage (replace `folder` with a real path before running):
# subfolders, files = run_fast_scandir(folder, [".jpg"])
In case you need the file size, you can also create a sizes list and add f.stat().st_size like this for a display of MiB:
sizes.append(f"{f.stat().st_size/1024/1024:.0f} MiB")
Speed analysis
for various methods to get all files with a specific file extension inside all subfolders and the main folder.
tl;dr:
fast_scandir clearly wins and is twice as fast as all other solutions, except os.walk. os.walk comes in second place, only slightly slower. - using
glob will greatly slow down the process. - None of the results use natural sorting. This means results will be sorted like this: 1, 10, 2. To get natural sorting (1, 2, 10), please have a look at:
Results:
fast_scandir took 499 ms. Found files: 16596. Found subfolders: 439 os.walk took 589 ms. Found files: 16596 find_files took 919 ms. Found files: 16596 glob.iglob took 998 ms. Found files: 16596 glob.glob took 1002 ms. Found files: 16596 pathlib.rglob took 1041 ms. Found files: 16596 os.walk-glob took 1043 ms. Found files: 16596
Updated: 2022-07-20 (Py 3.10.1 looking for *.pdf)
glob.iglob took 132 ms. Found files: 9999 glob.glob took 134 ms. Found files: 9999 fast_scandir took 331 ms. Found files: 9999. Found subfolders: 9330 os.walk took 695 ms. Found files: 9999 pathlib.rglob took 828 ms. Found files: 9999 find_files took 949 ms. Found files: 9999 os.walk-glob took 1242 ms. Found files: 9999
Tests were done with W7x64, Python 3.8.1, 20 runs. 16596 files in 439 (partially nested) subfolders.
find_files is from https://stackoverflow.com/a/45646357/2441026 and lets you search for several extensions.
fast_scandir was written by myself and will also return a list of subfolders. You can give it a list of extensions to search for (I tested a list with one entry to a simple if ... == ".jpg" and there was no significant difference).
# -*- coding: utf-8 -*-
# Python 3
"""Benchmark several ways to find all *.jpg files under a directory tree."""
import time
import os
from glob import glob, iglob
from pathlib import Path

directory = r"<folder>"  # <-- set this to the folder you want to scan
RUNS = 20


def run_os_walk():
    """os.walk with a manual extension filter."""
    a = time.time_ns()
    for i in range(RUNS):
        fu = [os.path.join(dp, f)
              for dp, dn, filenames in os.walk(directory)
              for f in filenames
              if os.path.splitext(f)[1].lower() == '.jpg']
    print(f"os.walk\t\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_os_walk_glob():
    """os.walk to enumerate folders, glob inside each one."""
    a = time.time_ns()
    for i in range(RUNS):
        fu = [y for x in os.walk(directory) for y in glob(os.path.join(x[0], '*.jpg'))]
    print(f"os.walk-glob\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_glob():
    """Recursive glob.glob with the ** pattern."""
    a = time.time_ns()
    for i in range(RUNS):
        fu = glob(os.path.join(directory, '**', '*.jpg'), recursive=True)
    print(f"glob.glob\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_iglob():
    """Recursive glob.iglob, materialized to a list for a fair comparison."""
    a = time.time_ns()
    for i in range(RUNS):
        fu = list(iglob(os.path.join(directory, '**', '*.jpg'), recursive=True))
    print(f"glob.iglob\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def run_pathlib_rglob():
    """pathlib.Path.rglob."""
    a = time.time_ns()
    for i in range(RUNS):
        fu = list(Path(directory).rglob("*.jpg"))
    print(f"pathlib.rglob\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(fu)}")


def find_files(files, dirs=None, extensions=None):
    # https://stackoverflow.com/a/45646357/2441026
    """Recursively collect matching file paths into *files* (mutated in place).

    Relies on os.listdir raising OSError for non-directories to tell files
    apart from folders. Mutable default arguments were replaced with None
    sentinels (best practice; behavior unchanged).
    """
    dirs = [] if dirs is None else dirs
    extensions = [] if extensions is None else extensions
    new_dirs = []
    for d in dirs:
        try:
            new_dirs += [os.path.join(d, f) for f in os.listdir(d)]
        except OSError:
            # *d* is a file (or unreadable); keep it if the extension matches.
            if os.path.splitext(d)[1].lower() in extensions:
                files.append(d)
    if new_dirs:
        find_files(files, new_dirs, extensions)
    else:
        return


def run_fast_scandir(dir, ext):    # dir: str, ext: list
    # https://stackoverflow.com/a/59803793/2441026
    """Return (subfolders, files) under *dir* for extensions in *ext*."""
    subfolders, files = [], []
    for entry in os.scandir(dir):
        if entry.is_dir():
            subfolders.append(entry.path)
        elif entry.is_file():
            if os.path.splitext(entry.name)[1].lower() in ext:
                files.append(entry.path)
    # Snapshot the direct children; each recursive call returns its subtree.
    for subdir in list(subfolders):
        sf, fl = run_fast_scandir(subdir, ext)
        subfolders.extend(sf)
        files.extend(fl)
    return subfolders, files


if __name__ == '__main__':
    run_os_walk()
    run_os_walk_glob()
    run_glob()
    run_iglob()
    run_pathlib_rglob()

    a = time.time_ns()
    for i in range(RUNS):
        files = []
        find_files(files, dirs=[directory], extensions=[".jpg"])
    print(f"find_files\t\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(files)}")

    a = time.time_ns()
    for i in range(RUNS):
        subf, files = run_fast_scandir(directory, [".jpg"])
    print(f"fast_scandir\ttook {(time.time_ns() - a) / 1000 / 1000 / RUNS:.0f} ms. Found files: {len(files)}. Found subfolders: {len(subf)}")