Auto-detect field widths in read_fwf when unspecified · Issue #4488 · pandas-dev/pandas (original) (raw)
Here's a quick hack for generating the `widths` argument.
It's basic, but it works for me.
def detect_fwf_widths(rows, wsratio=0.5, nlines=100):
    """Guess the field widths of fixed-width-formatted text lines.

    The heuristic looks at the first ``nlines`` lines column by column.
    A character is a "blank" candidate for a column when it occurs in
    more than ``wsratio`` of the sampled lines at that position.  The
    candidate with the highest total count is taken as the delimiter, and
    every column where it is frequent is a delimiter column.  A field ends
    right after the last column of each contiguous run of delimiter
    columns, i.e. fields are assumed to be padded on the right.

    Parameters
    ----------
    rows : sequence of str
        The text lines.  Must support ``len()`` and slicing.  Columns
        beyond the length of the shortest sampled line are not examined.
    wsratio : float, default 0.5
        Fraction of the sampled lines in which a character must appear at
        a given position to be considered a delimiter candidate there.
    nlines : int, default 100
        Maximum number of leading lines to sample.

    Returns
    -------
    list of int
        The detected field widths, measured against ``len(rows[0])``.
        Empty when ``rows`` is empty; a single width covering the whole
        first line when no delimiter column can be identified.
    """
    from collections import Counter  # local import, as in the original snippet

    if not rows:
        return []

    line_len = len(rows[0])
    # Only the sampled lines are counted, so the threshold below is
    # relative to the same population the counts come from.
    sample = rows[:max(0, min(len(rows), nlines))]
    threshold = len(sample) * wsratio

    # Column-wise occurrence counts, pruned to the characters frequent
    # enough to be candidates for "blank".
    frequent = [
        {char: count for char, count in Counter(column).items()
         if count > threshold}
        for column in zip(*sample)
    ]

    # Among the candidates, the most common one overall is the delimiter.
    totals = Counter()
    for candidates in frequent:
        totals.update(candidates)
    if not totals:
        # Nothing looks like a delimiter: treat the line as one field.
        return [line_len] if line_len else []
    delim = totals.most_common(1)[0][0]

    blank_cols = [idx for idx, candidates in enumerate(frequent)
                  if delim in candidates]

    # A field boundary sits just after the rightmost column of each
    # contiguous run of delimiter columns -- including the final run.
    widths = []
    start = 0
    last_pos = len(blank_cols) - 1
    for pos, col in enumerate(blank_cols):
        run_ends_here = pos == last_pos or blank_cols[pos + 1] != col + 1
        if run_ends_here:
            boundary = col + 1
            widths.append(boundary - start)
            start = boundary

    # Whatever follows the last delimiter run is the trailing field; skip
    # it when the delimiter run already reaches the end of the line.
    if start < line_len:
        widths.append(line_len - start)
    return widths