Because my previous job often involved processing files with about 1 million lines of fixed-width data, I researched this problem when I started using Python.
There are two types of fixed-width data:
- ASCII FixedWidth (ascii character length = 1, double-byte encoded character length = 2)
- Unicode FixedWidth (ascii character & double-byte encoded character length = 1)
If the source string consists entirely of ASCII characters, then ASCII FixedWidth and Unicode FixedWidth are identical.
Fortunately, str and bytes are distinct types in Python 3, which removes a lot of confusion when dealing with double-byte encoded characters (e.g. GBK, Big5, EUC-JP, Shift-JIS, etc.).
For the processing of "ASCII FixedWidth", the String is usually converted to Bytes and then split.
Without importing any third-party modules, and with totalLineCount = 1 million, lineLength = 800 bytes, FixedWidthArgs = (10, 25, 4, ...), I split the lines in about 5 different ways and reached the following conclusions:
- struct is the fastest (1x)
- A plain loop that does not pre-process FixedWidthArgs is the slowest (5x+)
- slice(bytes) is faster than slice(string)
- Test results when the source is bytes: struct (1x), operator.itemgetter (1.7x), precompiled slice objects & list comprehension (2.8x), re.Pattern object (2.9x)
When dealing with large files, we often use `with open(file, "rb") as f:`.
Simply iterating over one of the above files this way takes about 2.4 seconds.
I think an appropriate handler, which processes 1 million rows of data and splits each row into 20 fields, should take less than 2.4 seconds.
I found that only struct and itemgetter meet this requirement.
P.S.: So that the output displays correctly, I converted the Unicode str fields to bytes before printing. If you are in a double-byte environment, you don't need to do this.
from itertools import accumulate
from operator import itemgetter


def oprt_parser(sArgs):
    """Build a callable that splits a fixed-width record into a tuple of fields.

    Args:
        sArgs: Iterable of integer field widths, in record order. A negative
            width marks a field of that absolute width which is skipped
            (it consumes space in the record but is not returned).

    Returns:
        A callable taking one sliceable record (``str`` or ``bytes``) and
        returning a tuple with one slice per non-negative width. For ``str``
        the widths count characters; for ``bytes`` they count bytes.
    """
    # Running end offset of every field, kept or skipped.
    ends = tuple(accumulate(abs(width) for width in sArgs))
    starts = (0,) + ends[:-1]
    # Keep slice objects only for the fields that are not marked as skipped.
    kept = [
        slice(start, end)
        for width, start, end in zip(sArgs, starts, ends)
        if width >= 0
    ]
    # itemgetter returns a bare item (not a 1-tuple) for a single key and
    # raises TypeError for no keys; handle both so the result is always a tuple.
    if not kept:
        return lambda record: ()
    if len(kept) == 1:
        only = kept[0]
        return lambda record: (record[only],)
    # Fast path: one C-level call extracts every field.
    return itemgetter(*kept)


lineb = b'abcdefghijklmnopqrstuvwxyz\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4\xb6\xee\xb7\xa2\xb8\xf6\xba\xcd0123456789'
line = lineb.decode("GBK")

# Unicode Fixed Width
fieldwidthsU = (13, -13, 4, -4, 5, -5)  # Negative width fields are ignored
# ASCII Fixed Width
fieldwidths = (13, -13, 8, -8, 5, -5)  # Negative width fields are ignored

# Unicode FixedWidth processing
parse = oprt_parser(fieldwidthsU)
fields = parse(line)
# Encode back to bytes only so the double-byte characters print unambiguously.
print('Unicode FixedWidth', 'fields: {}'.format(tuple(map(lambda s: s.encode("GBK"), fields))))

# ASCII FixedWidth processing
parse = oprt_parser(fieldwidths)
fields = parse(lineb)
print('ASCII FixedWidth', 'fields: {}'.format(fields))

line = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n'
fieldwidths = (2, -10, 24)
parse = oprt_parser(fieldwidths)
fields = parse(line)
print(f"fields: {fields}")
Output:
Unicode FixedWidth fields: (b'abcdefghijklm', b'\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4', b'01234')
ASCII FixedWidth fields: (b'abcdefghijklm', b'\xb0\xa1\xb2\xbb\xb4\xd3\xb5\xc4', b'01234')
fields: ('AB', 'MNOPQRSTUVWXYZ0123456789')
oprt_parser is about 4x as fast as make_parser (list comprehension + slice).
During my research I found that, on a faster CPU, the performance of the re method seems to improve more than that of the other methods.
Since I don't have additional or faster computers to test on, I am providing my test code below; if anyone is interested, you can run it on a faster computer.
Run Environment:
- os:win10
- python: 3.7.2
- CPU:amd athlon x3 450
- HD:seagate 1T
# Micro-benchmark comparing several ways to split one fixed-width record
# (str or GBK-encoded bytes) into 20 fields. Importing this module only builds
# the parsers and test data; the timing run happens in main().
import itertools
import re
import struct
import time
import timeit
from itertools import accumulate
from operator import itemgetter

timesI = 100000  # calls per timing round in the summary table
roundI = 5       # timing rounds averaged per statement in the summary table


def eff2(stmt, onlyNum=False, showResult=False):
    """Time the statement string *stmt* against this module's globals.

    Args:
        stmt: Python source to time, e.g. ``"Struct_P(tBytes)"``. It is
            executed (and, with showResult, passed to eval), so only pass
            trusted strings.
        onlyNum: When true, run ``roundI`` rounds of ``timesI`` calls and
            return the mean time per call in microseconds as a string with
            four decimals; nothing is printed.
        showResult: Only used when onlyNum is false; also print the value of
            the statement and the raw list of timings.

    Returns:
        The formatted number when onlyNum is true, otherwise None (a quick
        10 x 1000 measurement is printed instead).
    """
    if onlyNum:
        rl = timeit.repeat(stmt=stmt, repeat=roundI, number=timesI, globals=globals())
        avg = sum(rl) / len(rl)
        return f"{avg * (10 ** 6)/timesI:0.4f}"

    rl = timeit.repeat(stmt=stmt, repeat=10, number=1000, globals=globals())
    avg = sum(rl) / len(rl)
    print(f"【{stmt}】")
    # Microseconds per call is numerically the same as seconds per million calls.
    print(f"\tquick avg = {avg * (10 ** 6)/1000:0.4f} s/million")
    if showResult:
        print(f"\t Result = {eval(stmt)}\n\t timelist = {rl}\n")
    else:
        print("")


def upDouble(argList, argRate):
    """Return a list with every element of *argList* multiplied by *argRate*.

    For a string this repeats each character argRate times; for a sequence of
    ints it scales each width by argRate.
    """
    return [c * argRate for c in argList]


# Base record: 93 characters, 100 bytes once encoded as GBK.
tbStr = "000000001111000002222真2233333333000000004444444QAZ55555555000000006666666ABC这些事中文字abcdefghijk"
tbBytes = tbStr.encode("GBK")
# Field widths of the base record measured in bytes (a20) and in characters
# (a20U); they differ only for the fields holding double-byte characters.
a20 = (4, 4, 2, 2, 2, 3, 2, 2, 2, 2, 8, 8, 7, 3, 8, 8, 7, 3, 12, 11)
a20U = (4, 4, 2, 2, 2, 3, 2, 2, 1, 2, 8, 8, 7, 3, 8, 8, 7, 3, 6, 11)

Slng = 800            # target record length in bytes
rateS = Slng // 100   # scale factor applied to the 100-byte base record
tStr = "".join(upDouble(tbStr, rateS))
tBytes = tStr.encode("GBK")
spltArgs = upDouble(a20, rateS)
spltArgsU = upDouble(a20U, rateS)


def str_parser(sArgs):
    """Baseline: plain loop over the widths, slicing a str; returns a tuple."""
    def prsr(oStr):
        r = []
        r_ap = r.append
        stt = 0
        for lng in sArgs:
            end = stt + lng
            r_ap(oStr[stt:end])
            stt = end
        return tuple(r)
    return prsr


def byte_parser(sArgs):
    """Baseline: plain loop over the widths, slicing bytes; returns a list."""
    def prsr(oBytes):
        r, stt = [], 0
        r_ap = r.append
        for lng in sArgs:
            end = stt + lng
            r_ap(oBytes[stt:end])
            stt = end
        return r
    return prsr


def rebc_parser(sArgs, otype="b"):
    """Split with one precompiled regex holding a ``(.{n})`` group per width.

    Args:
        sArgs: Field widths.
        otype: ``"b"`` (default) compiles a bytes pattern; any other value
            compiles a str pattern.

    Returns:
        A callable returning the tuple of matched groups. It raises
        AttributeError if the record is shorter than the total width, because
        the match is then None.
    """
    re_Args = "".join([f"(.{{{n}}})" for n in sArgs])
    # DOTALL: a fixed-width field may contain a newline byte/character, which
    # "." would otherwise refuse to match.
    if otype == "b":
        rebc_Args = re.compile(re_Args.encode("GBK"), re.DOTALL)
    else:
        rebc_Args = re.compile(re_Args, re.DOTALL)

    def prsr(oBS):
        return rebc_Args.match(oBS).groups()
    return prsr


def struct_parser(sArgs):
    """Split bytes with struct.unpack using an ``"<n>s <n>s ..."`` format.

    The record length must equal the sum of the widths exactly, otherwise
    struct.unpack raises struct.error.
    """
    struct_Args = " ".join(map(lambda x: str(x) + "s", sArgs))

    def prsr(oBytes):
        return struct.unpack(struct_Args, oBytes)
    return prsr


def slice_parser(sArgs):
    """Split with a list comprehension over precomputed (start, end) pairs."""
    tl = tuple(itertools.accumulate(sArgs))
    slice_Args = tuple(zip((0,) + tl, tl))

    def prsr(oBytes):
        return [oBytes[s:e] for s, e in slice_Args]
    return prsr


def sliceObj_parser(sArgs):
    """Split with a list comprehension over precomputed slice objects."""
    tl = tuple(itertools.accumulate(sArgs))
    tl2 = tuple(zip((0,) + tl, tl))
    sliceObj_Args = tuple(slice(s, e) for s, e in tl2)

    def prsr(oBytes):
        return [oBytes[so] for so in sliceObj_Args]
    return prsr


def oprt_parser(sArgs):
    """Split with operator.itemgetter over precomputed slice objects.

    Negative widths mark fields that are skipped. The returned callable always
    yields a tuple with one entry per non-negative width.
    """
    ends = tuple(accumulate(abs(width) for width in sArgs))
    starts = (0,) + ends[:-1]
    kept = [
        slice(start, end)
        for width, start, end in zip(sArgs, starts, ends)
        if width >= 0
    ]
    # itemgetter returns a bare item for one key and raises TypeError for none;
    # cover both so the result shape does not depend on the field count.
    if not kept:
        return lambda record: ()
    if len(kept) == 1:
        only = kept[0]
        return lambda record: (record[only],)
    return itemgetter(*kept)


# One parser instance per (method, input type) combination being compared.
Str_P = str_parser(spltArgsU)
Byte_P = byte_parser(spltArgs)
Rebc_P = rebc_parser(spltArgs)
Rebc_Ps = rebc_parser(spltArgsU, "s")
Struct_P = struct_parser(spltArgs)
Slice_P = slice_parser(spltArgs)
SliceObj_P = sliceObj_parser(spltArgs)
SliceObj_Ps = sliceObj_parser(spltArgsU)
Oprt_P = oprt_parser(spltArgs)
Oprt_Ps = oprt_parser(spltArgsU)

# (section title, statements timed in that section), in report order.
SECTIONS = (
    ("pure str", ("Str_P(tStr)",)),
    ("pure bytes", ("Byte_P(tBytes)",)),
    ("re compile object", ("Rebc_P(tBytes)", "Rebc_Ps(tStr)")),
    ("struct", ("Struct_P(tBytes)",)),
    ("List Comprehensions + slice",
     ("Slice_P(tBytes)", "SliceObj_P(tBytes)", "SliceObj_Ps(tStr)")),
    ("operator.itemgetter + slice object", ("Oprt_P(tBytes)", "Oprt_Ps(tStr)")),
)
testList = [stmt for _title, stmts in SECTIONS for stmt in stmts]


def main(detailed=False):
    """Run the benchmark and print the summary table.

    Args:
        detailed: When true, additionally print a quick per-statement
            measurement (eff2 without onlyNum) under each section header.
    """
    print(f"test round = {roundI} timesI = {timesI} sourceLng = {len(tStr)} argFieldCount = {len(spltArgs)}")
    rule = ''.ljust(60, '-')
    for title, stmts in SECTIONS:
        print(f"{title} \n{rule}")
        if detailed:
            for stmt in stmts:
                eff2(stmt)
    # Header row uses the parser name, i.e. the statement text before "(".
    print("|".join([s.split("(")[0].center(11, " ") for s in testList]))
    print("|".join(["".center(11, "-") for s in testList]))
    print("|".join([eff2(s, True).rjust(11, " ") for s in testList]))


if __name__ == "__main__":
    main()
Output:
test round = 5 timesI = 100000 sourceLng = 744 argFieldCount = 20
...
...
   Str_P   |   Byte_P  |   Rebc_P  |  Rebc_Ps  |  Struct_P |  Slice_P  | SliceObj_P|SliceObj_Ps|   Oprt_P  |  Oprt_Ps  
-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------
     9.6315|     7.5952|     4.4187|     5.6867|     1.5123|     5.2915|     4.2673|     5.7121|     2.4713|     3.9051