Auto-detect field widths in read_fwf when unspecified · Issue #4488 · pandas-dev/pandas (original) (raw)

Here's a quick hack for generating the widths arg.
It's basic, but it works for me.

def detect_fwf_widths(rows, wsratio=0.5, nlines=100):
    """Guess the column widths of fixed-width text lines.

    A character position is treated as a column separator when one
    "blank" character occupies it in more than ``wsratio`` of the sampled
    lines.  The blank character itself is inferred: it is the character
    that is frequent in the largest share of positions.  Each field is
    taken to extend through the run of blank positions that follows it.

    Parameters
    ----------
    rows : sequence of str
        The text lines.  Must support ``len()``, indexing and slicing.
        Only positions present in every sampled line are inspected,
        because the lines are zipped together character by character.
    wsratio : float, default 0.5
        Fraction of the sampled lines that must share the same character
        at a position for it to be a separator candidate.
    nlines : int, default 100
        Maximum number of leading lines to sample.

    Returns
    -------
    list of int
        Field widths, left to right, summing to ``len(rows[0])``.
        Returns ``[]`` for empty input and a single width covering the
        whole line when no separator can be found.

    Notes
    -----
    A run of blanks at the very start of the lines yields its own
    (blank-only) width entry.
    """
    # Local import keeps the snippet self-contained.
    from collections import Counter

    if len(rows) == 0:
        return []

    # Sample only the first ``nlines`` rows so that the counts and the
    # threshold below refer to the same set of lines.
    nlines = min(len(rows), nlines)
    sample = rows[:nlines]
    line_width = len(rows[0])
    threshold = nlines * wsratio

    # Column-wise, count character occurrences and keep only the
    # characters frequent enough to be candidates for "blank".
    counts = [
        {char: n for char, n in Counter(column).items() if n > threshold}
        for column in zip(*sample)
    ]

    # Among the candidates, the most common one overall is the "blank";
    # then keep only the positions where it appears frequently.
    totals = Counter()
    for candidates in counts:
        totals.update(candidates)
    if not totals:
        # No character is frequent anywhere: treat the line as one field.
        return [line_width] if line_width else []
    delim = totals.most_common(1)[0][0]
    cols = [i for i, candidates in enumerate(counts) if delim in candidates]

    # Walk the blank positions and close a field at the rightmost index
    # of each contiguous run.
    width_list = []
    start = 0
    prev = cols[0]
    for idx in cols[1:]:
        if idx - prev != 1:
            width_list.append(prev + 1 - start)
            start = prev + 1
        prev = idx

    # The loop only closes a run when a later run begins, so the final
    # run is still open.  If data follows it, close it here; otherwise
    # the trailing blanks belong to the last field.
    if prev + 1 < line_width:
        width_list.append(prev + 1 - start)
        start = prev + 1
    width_list.append(line_width - start)

    return width_list