Issue 1452246: htmllib doesn't properly substitute entities (original) (raw)

I'd like to illustrate the problem and suggest a fix by showing a simple Python file (named htmllib2.py, so you can uncomment the line in the doctest to see that my fix works). It's more of a hack than a proper fix, though:

#!/usr/bin/env python2.4

""" Use this instead of htmllib for having entitydefs substituted in attributes,too.

Example:

# >>> import htmllib
>>> import htmllib2 as htmllib
>>> import formatter
>>> import StringIO
>>> s = StringIO.StringIO()
>>> p = htmllib.HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(s)))
>>> p.feed('<>&')
>>> s.getvalue()
'<>&'
"""

# Public API of this module: only the patched parser class is exported.
# (Spelled with double underscores; a plain ``all`` would merely shadow the
# builtin and would not be honoured by ``from module import *``.)
__all__ = ("HTMLParser",)

import htmllib
from htmlentitydefs import name2codepoint as entitytable

# Rebind entitytable from the imported {entity name: code point} mapping to
# {entity name: character}, keeping only entries whose code point is below
# 256 (the `v < 256` filter) so that chr() can build the character.
# A generator expression is used instead of a list comprehension so the loop
# variables do not leak into the module namespace on Python 2.
entitytable = dict((k, chr(v)) for k, v in entitytable.items() if v < 256)

def entitysub(s, table=None):
    """Return *s* with known ``&name;`` entity references substituted.

    Parameters:
        s: the string to process (e.g. an attribute value).
        table: optional mapping of entity name (without ``&`` and ``;``) to
            its replacement string.  Defaults to the module-level
            ``entitytable``.

    Returns:
        A new string in which every ``&name;`` whose name is a key of the
        table is replaced by the mapped value.  Unknown references and
        ampersands that never reach a closing ``;`` are copied through
        unchanged.
    """
    if table is None:
        table = entitytable

    pieces = []
    # Text of the candidate reference collected so far, including the leading
    # '&'; None while we are not inside a candidate reference.
    pending = None
    for c in s:
        if pending is None:
            if c == '&':
                pending = c
            else:
                pieces.append(c)
        elif c == ';':
            # End of the candidate: substitute if known, else keep verbatim.
            pieces.append(table.get(pending[1:], pending + ';'))
            pending = None
        elif c == '&':
            # The earlier '&' did not start a reference after all; emit it
            # literally and restart, so a following real entity is still
            # recognised (e.g. "a & b &amp; c").
            pieces.append(pending)
            pending = c
        else:
            pending += c

    # An unterminated candidate (e.g. "AT&T") is plain text; do not drop it.
    if pending is not None:
        pieces.append(pending)
    return "".join(pieces)

class HTMLParser(htmllib.HTMLParser):
    """htmllib.HTMLParser that runs entitysub() over attribute values."""

    def handle_starttag(self, tag, method, attrs):
        """Substitute entities in each attribute value, then call *method*.

        *attrs* is iterated as (name, value) pairs; *method* is called once
        with the list of (name, substituted value) pairs.
        """
        repaired = []
        for name, value in attrs:
            repaired.append((name, entitysub(value)))
        method(repaired)

if __name__ == '__main__':
    # When executed as a script, run the doctest examples found in this
    # module's docstrings as a self-test.
    import doctest
    doctest.testmod()