Issue 1452246: htmllib doesn't properly substitute entities (original) (raw)
I'd like to illustrate the problem and suggest a fix by showing a simple Python file (named htmllib2.py, so that you can uncomment the line in the doctest case to see that my fix works). It is more of a hack than a proper fix, though:
#!/usr/bin/env python2.4
""" Use this instead of htmllib for having entitydefs substituted in attributes,too.
Example:
import htmllib
>>> import htmllib2 as htmllib
import formatter import StringIO s = StringIO.StringIO() p = htmllib.HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(s))) p.feed('
') s.getvalue() '<>&' """
# Names exported by ``from <this module> import *``.
__all__ = ("HTMLParser",)

import htmllib
from htmlentitydefs import name2codepoint as entitytable

# Rebind ``entitytable`` from {entity name: code point} to
# {entity name: one-character string}.  Only code points below 256 are kept,
# since chr() is applied to each one.
entitytable = dict(
    (name, chr(codepoint))
    for name, codepoint in entitytable.items()
    if codepoint < 256
)
def entitysub(s):
    """Return *s* with named entity references replaced by their characters.

    A reference has the form ``&name;``.  Names found in the module-level
    ``entitytable`` are replaced by the mapped character; unknown names are
    left in the output unchanged (``&name;``).

    An ``&`` that is never closed by ``;`` is not a reference, so the text
    following it is emitted literally instead of being dropped.  A second
    ``&`` seen while collecting a name flushes the text collected so far and
    starts a new candidate reference, so ``"a & b &amp; c"`` becomes
    ``"a & b & c"``.

    Parameters:
        s: the attribute value (a string) to decode.

    Returns:
        The decoded string.
    """
    pieces = []
    # ``pending`` holds a candidate reference, including its leading '&',
    # or None while copying ordinary text.
    pending = None
    for c in s:
        if pending is None:
            if c == '&':
                pending = c
            else:
                pieces.append(c)
        elif c == ';':
            # Reference complete: substitute it, or keep it verbatim if the
            # name is unknown.
            pieces.append(entitytable.get(pending[1:], pending + ';'))
            pending = None
        elif c == '&':
            # The earlier '&' was literal text; emit it and start over.
            pieces.append(pending)
            pending = c
        else:
            pending += c
    if pending is not None:
        # Unterminated '&...' at end of input: keep it as literal text.
        pieces.append(pending)
    return "".join(pieces)
class HTMLParser(htmllib.HTMLParser):
    """htmllib.HTMLParser variant that decodes entities in attribute values."""

    def handle_starttag(self, tag, method, attrs):
        """Decode entity references in each attribute value, then dispatch.

        Every ``(name, value)`` pair in *attrs* has its value passed through
        ``entitysub``; *method* is then called with the rebuilt list.
        *tag* is accepted but not used here.
        """
        repaired = []
        for key, value in attrs:
            repaired.append((key, entitysub(value)))
        method(repaired)
if __name__ == '__main__':
    # When executed as a script, run the doctest examples found in this
    # module's docstrings.
    import doctest
    doctest.testmod()