You can write a wrapper for OpenSSL's MD5() function that accepts NumPy arrays. Our baseline will be a pure Python implementation.
Create a builder
# build.py
"""CFFI build script for the ``_md5`` extension module.

Running this file compiles a small C helper, ``md5_array``, that computes
one MD5 digest per 8-byte array element using OpenSSL. The module-level
``ffi`` object is what a ``cffi_modules=["...build.py:ffi"]`` entry in
``setup.py`` refers to.
"""
import cffi

ffi = cffi.FFI()

# Declarations made visible to Python as ``_md5.lib.<name>``.
header = r"""
void md5_array(uint64_t *buffer, size_t len, unsigned char *out);
"""

source = r"""
#include <stddef.h>
#include <stdint.h>
#include <openssl/md5.h>

/* Hash every 8-byte element of `buffer` on its own.  The digest of element
 * i is written to out[i * 16 .. i * 16 + 15], so `out` must provide at
 * least len * MD5_DIGEST_LENGTH bytes.  size_t is used for the count and
 * the index so that the output offset cannot overflow a signed int on
 * very large arrays. */
void md5_array(uint64_t *buffer, size_t len, unsigned char *out) {
    size_t i;
    for (i = 0; i < len; i++) {
        MD5((const unsigned char *) &buffer[i], sizeof buffer[i],
            out + i * MD5_DIGEST_LENGTH);
    }
}
"""

# MD5() is exported by OpenSSL's libcrypto, not libssl; linking only against
# libssl relies on a transitive dependency that is not guaranteed to resolve.
ffi.set_source("_md5", source, libraries=["crypto"])
ffi.cdef(header)

if __name__ == "__main__":
    ffi.compile()
and a wrapper
# md5.py
"""NumPy-facing wrapper around the compiled ``_md5`` CFFI module."""
import numpy as np

import _md5


def md5_array(data):
    """Return the MD5 digest of each 8-byte element of *data*.

    Args:
        data: Array (or array-like) whose elements are exactly 8 bytes wide,
            e.g. ``np.uint64``. Each element's raw bytes are hashed
            independently.

    Returns:
        ``np.ndarray`` of dtype ``'|S16'`` with the same shape as *data*;
        entry ``i`` holds the 16-byte digest of element ``i``.

    Raises:
        TypeError: If the elements of *data* are not 8 bytes wide. The C
            routine always reads 8 bytes per element, so any other item size
            would read outside the buffer or hash the wrong bytes.

    Note:
        NumPy's fixed-width bytes dtype drops trailing NUL bytes when a
        single item is read, so a digest that ends in ``\\x00`` comes back
        shorter than 16 bytes when indexed. Use ``out.view(np.uint8)`` if
        the raw, unstripped digest bytes are needed.
    """
    # The C loop walks the buffer linearly; order="C" copies only if needed.
    data = np.asarray(data, order="C")
    if data.dtype.itemsize != 8:
        raise TypeError(
            "md5_array expects 8-byte elements (e.g. uint64), got dtype "
            f"{data.dtype} with itemsize {data.dtype.itemsize}"
        )

    out = np.zeros(data.shape, dtype="|S16")

    # Keep the buffer objects referenced for the duration of the C call and
    # cast explicitly instead of relying on implicit char* conversion.
    data_buf = _md5.ffi.from_buffer(data)
    out_buf = _md5.ffi.from_buffer(out)
    _md5.lib.md5_array(
        _md5.ffi.cast("uint64_t *", data_buf),
        data.size,
        _md5.ffi.cast("unsigned char *", out_buf),
    )
    return out
and a script to compare the two:
# run.py
"""Compare the CFFI-backed md5_array with a hashlib reference."""
import hashlib

import numpy as np

import md5

values = np.arange(16, dtype=np.uint64)

# Reference digests: hash the raw bytes of each element with hashlib.
reference = [hashlib.md5(value).digest() for value in values]
# The same digests, computed in a single call through the C wrapper.
wrapped = md5.md5_array(values)

print(values)
# [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]

print(reference)
# [b'}\xea6+?\xac\x8e\x00\x95jIR\xa3\xd4\xf4t', ... , b'w)\r\xf2^\x84\x11w\xbb\xa1\x94\xc1\x8c8XS']

print(wrapped)
# [b'}\xea6+?\xac\x8e\x00\x95jIR\xa3\xd4\xf4t', ... , b'w)\r\xf2^\x84\x11w\xbb\xa1\x94\xc1\x8c8XS']

# Element-wise comparison of the reference list against the wrapper's array.
print(all(reference == wrapped))
# True
To compile the bindings and run the script, run
# First compile the _md5 extension module, then run the comparison script.
python build.py
python run.py
For large arrays it's about 14x faster (I am a bit disappointed by that, honestly...):
# IPython session: time the hashlib list comprehension against the CFFI
# wrapper on 100,000 elements. The line after each %timeit is its output.
data = np.arange(100000, dtype=np.uint64)
%timeit [hashlib.md5(i).digest() for i in data]
169 ms ± 3.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit md5.md5_array(data)
12.1 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
If you want to put the bindings in a library and compile them at install time, put the following in your setup.py:
setup(
    ...,
    # cffi is requested at setup time; presumably this is what lets
    # setuptools process the ``cffi_modules`` option below (confirm against
    # the CFFI distribution docs).
    setup_requires=["cffi>=1.0.0"],
    # "<path to the build script>:<name of the FFI object defined in it>"
    cffi_modules=["package/build.py:ffi"],
    # cffi is also declared as a dependency of the installed package.
    install_requires=["cffi>=1.0.0"],
)