Python 3.4 includes a new module: tracemalloc. It provides detailed statistics about which code is allocating the most memory. Here's an example that displays the top three lines allocating memory.
from collections import Counter import linecache import os import tracemalloc def display_top(snapshot, key_type='lineno', limit=3): snapshot = snapshot.filter_traces(( tracemalloc.Filter(False, "<frozen importlib._bootstrap>"), tracemalloc.Filter(False, "<unknown>"), )) top_stats = snapshot.statistics(key_type) print("Top %s lines" % limit) for index, stat in enumerate(top_stats[:limit], 1): frame = stat.traceback[0] # replace "/path/to/module/file.py" with "module/file.py" filename = os.sep.join(frame.filename.split(os.sep)[-2:]) print("#%s: %s:%s: %.1f KiB" % (index, filename, frame.lineno, stat.size / 1024)) line = linecache.getline(frame.filename, frame.lineno).strip() if line: print(' %s' % line) other = top_stats[limit:] if other: size = sum(stat.size for stat in other) print("%s other: %.1f KiB" % (len(other), size / 1024)) total = sum(stat.size for stat in top_stats) print("Total allocated size: %.1f KiB" % (total / 1024)) tracemalloc.start() counts = Counter() fname = '/usr/share/dict/american-english' with open(fname) as words: words = list(words) for word in words: prefix = word[:3] counts[prefix] += 1 print('Top prefixes:', counts.most_common(3)) snapshot = tracemalloc.take_snapshot() display_top(snapshot)
And here are the results:
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)] Top 3 lines #1: scratches/memory_test.py:37: 6527.1 KiB words = list(words) #2: scratches/memory_test.py:39: 247.7 KiB prefix = word[:3] #3: scratches/memory_test.py:40: 193.0 KiB counts[prefix] += 1 4 other: 4.3 KiB Total allocated size: 6972.1 KiB
When is a memory leak not a leak?
That example is great when the memory is still being held at the end of the calculation, but sometimes you have code that allocates a lot of memory and then releases it all. It's not technically a memory leak, but it's using more memory than you think it should. How can you track memory usage when it all gets released? If it's your code, you can probably add some debugging code to take snapshots while it's running. If not, you can start a background thread to monitor memory usage while the main thread runs.
Here's the previous example where the code has all been moved into the count_prefixes() function. When that function returns, all the memory is released. I also added some sleep() calls to simulate a long-running calculation.
from collections import Counter import linecache import os import tracemalloc from time import sleep def count_prefixes(): sleep(2) # Start up time. counts = Counter() fname = '/usr/share/dict/american-english' with open(fname) as words: words = list(words) for word in words: prefix = word[:3] counts[prefix] += 1 sleep(0.0001) most_common = counts.most_common(3) sleep(3) # Shut down time. return most_common def main(): tracemalloc.start() most_common = count_prefixes() print('Top prefixes:', most_common) snapshot = tracemalloc.take_snapshot() display_top(snapshot) def display_top(snapshot, key_type='lineno', limit=3): snapshot = snapshot.filter_traces(( tracemalloc.Filter(False, "<frozen importlib._bootstrap>"), tracemalloc.Filter(False, "<unknown>"), )) top_stats = snapshot.statistics(key_type) print("Top %s lines" % limit) for index, stat in enumerate(top_stats[:limit], 1): frame = stat.traceback[0] # replace "/path/to/module/file.py" with "module/file.py" filename = os.sep.join(frame.filename.split(os.sep)[-2:]) print("#%s: %s:%s: %.1f KiB" % (index, filename, frame.lineno, stat.size / 1024)) line = linecache.getline(frame.filename, frame.lineno).strip() if line: print(' %s' % line) other = top_stats[limit:] if other: size = sum(stat.size for stat in other) print("%s other: %.1f KiB" % (len(other), size / 1024)) total = sum(stat.size for stat in top_stats) print("Total allocated size: %.1f KiB" % (total / 1024)) main()
When I run that version, the memory usage has gone from 6MB down to 4KB, because the function released all its memory when it finished.
Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)] Top 3 lines #1: collections/__init__.py:537: 0.7 KiB self.update(*args, **kwds) #2: collections/__init__.py:555: 0.6 KiB return _heapq.nlargest(n, self.items(), key=_itemgetter(1)) #3: python3.6/heapq.py:569: 0.5 KiB result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)] 10 other: 2.2 KiB Total allocated size: 4.0 KiB
Now here's a version inspired by another answer that starts a second thread to monitor memory usage.
from collections import Counter import linecache import os import tracemalloc from datetime import datetime from queue import Queue, Empty from resource import getrusage, RUSAGE_SELF from threading import Thread from time import sleep def memory_monitor(command_queue: Queue, poll_interval=1): tracemalloc.start() old_max = 0 snapshot = None while True: try: command_queue.get(timeout=poll_interval) if snapshot is not None: print(datetime.now()) display_top(snapshot) return except Empty: max_rss = getrusage(RUSAGE_SELF).ru_maxrss if max_rss > old_max: old_max = max_rss snapshot = tracemalloc.take_snapshot() print(datetime.now(), 'max RSS', max_rss) def count_prefixes(): sleep(2) # Start up time. counts = Counter() fname = '/usr/share/dict/american-english' with open(fname) as words: words = list(words) for word in words: prefix = word[:3] counts[prefix] += 1 sleep(0.0001) most_common = counts.most_common(3) sleep(3) # Shut down time. return most_common def main(): queue = Queue() poll_interval = 0.1 monitor_thread = Thread(target=memory_monitor, args=(queue, poll_interval)) monitor_thread.start() try: most_common = count_prefixes() print('Top prefixes:', most_common) finally: queue.put('stop') monitor_thread.join() def display_top(snapshot, key_type='lineno', limit=3): snapshot = snapshot.filter_traces(( tracemalloc.Filter(False, "<frozen importlib._bootstrap>"), tracemalloc.Filter(False, "<unknown>"), )) top_stats = snapshot.statistics(key_type) print("Top %s lines" % limit) for index, stat in enumerate(top_stats[:limit], 1): frame = stat.traceback[0] # replace "/path/to/module/file.py" with "module/file.py" filename = os.sep.join(frame.filename.split(os.sep)[-2:]) print("#%s: %s:%s: %.1f KiB" % (index, filename, frame.lineno, stat.size / 1024)) line = linecache.getline(frame.filename, frame.lineno).strip() if line: print(' %s' % line) other = top_stats[limit:] if other: size = sum(stat.size for stat in other) print("%s other: %.1f KiB" % (len(other), size / 1024)) total = sum(stat.size for stat in top_stats) print("Total allocated size: %.1f KiB" % (total / 1024)) main()
The resource module lets you check the current memory usage, and save the snapshot from the peak memory usage. The queue lets the main thread tell the memory monitor thread when to print its report and shut down. When it runs, it shows the memory being used by the list() call:
2018-05-29 10:34:34.441334 max RSS 10188 2018-05-29 10:34:36.475707 max RSS 23588 2018-05-29 10:34:36.616524 max RSS 38104 2018-05-29 10:34:36.772978 max RSS 45924 2018-05-29 10:34:36.929688 max RSS 46824 2018-05-29 10:34:37.087554 max RSS 46852 Top prefixes: [('con', 1220), ('dis', 1002), ('pro', 809)] 2018-05-29 10:34:56.281262 Top 3 lines #1: scratches/scratch.py:36: 6527.0 KiB words = list(words) #2: scratches/scratch.py:38: 16.4 KiB prefix = word[:3] #3: scratches/scratch.py:39: 10.1 KiB counts[prefix] += 1 19 other: 10.8 KiB Total allocated size: 6564.3 KiB
If you're on Linux, you may find /proc/self/statm more useful than the resource module.