Issue 849097: Request: getpos() for sgmllib (original) (raw)

While working on my master's thesis, I discovered the need for a working getpos() in sgmllib.py. As it stands, you can call it successfully, since it is inherited from markupbase.py, but you will always get the answer (1, 0) because the position is never updated.

To fix this, one needs to change the goahead function. This is my own implementation of the change, in part influenced by the "sister" goahead function in HTMLParser.py:

def goahead(self, end):
    """Scan the buffered input, dispatch to handlers and track position.

    Walks ``self.rawdata`` from the start, handing plain text to
    ``handle_data`` and markup to the ``parse_*`` / ``handle_*`` methods.
    After every span that is consumed, ``updatepos`` is called with the
    span's start and end indices so that the inherited ``getpos()``
    reports a current position instead of a value that never changes.
    Handlers are invoked *before* the position is advanced.

    Parameters:
        end: when true, whatever is left unparsed once scanning stops is
            flushed through ``handle_data`` instead of being kept.

    On return, ``self.rawdata`` holds only the unconsumed tail of the
    buffer (empty if everything was handled).
    """
    rawdata = self.rawdata
    startswith = rawdata.startswith  # loop-invariant, bind once
    i = 0
    n = len(rawdata)
    while i < n:
        if self.nomoretags:
            # Everything that remains is plain data.
            self.handle_data(rawdata[i:n])
            self.updatepos(i, n)
            i = n
            break
        match = interesting.search(rawdata, i)
        if match:
            j = match.start()
        else:
            j = n
        if i < j:
            self.handle_data(rawdata[i:j])
            self.updatepos(i, j)
        i = j
        if i == n:
            break
        if rawdata[i] == '<':
            if starttagopen.match(rawdata, i):
                if self.literal:
                    self.handle_data(rawdata[i])
                    self.updatepos(i, i + 1)
                    i = i + 1
                    continue
                k = self.parse_starttag(i)
                if k < 0:
                    # Negative result: stop here and keep the rest buffered.
                    break
                self.updatepos(i, k)
                i = k
                continue
            if startswith("</", i):
                k = self.parse_endtag(i)
                if k < 0:
                    break
                self.updatepos(i, k)
                i = k
                self.literal = 0
                continue
            if self.literal:
                if n > (i + 1):
                    self.handle_data("<")
                    # Account for the single '<' before stepping past it.
                    self.updatepos(i, i + 1)
                    i = i + 1
                else:
                    # incomplete
                    break
                continue
            if startswith("<!--", i):
                # Strictly speaking, a comment is --.*--
                # within a declaration tag <!...>.
                # This should be removed,
                # and comments handled only in parse_declaration.
                k = self.parse_comment(i)
                if k < 0:
                    break
                self.updatepos(i, k)
                i = k
                continue
            if startswith("<?", i):
                k = self.parse_pi(i)
                if k < 0:
                    break
                # The advance below adds k to i, i.e. parse_pi's result is
                # used as a length rather than an end index, so the span
                # that was consumed ends at i + k.
                self.updatepos(i, i + k)
                i = i + k
                continue
            if startswith("<!", i):
                # This is some sort of declaration; in "HTML as
                # deployed," this should only be the document type
                # declaration ("<!DOCTYPE html...>").
                k = self.parse_declaration(i)
                if k < 0:
                    break
                self.updatepos(i, k)
                i = k
                continue
        elif rawdata[i] == '&':
            if self.literal:
                self.handle_data(rawdata[i])
                self.updatepos(i, i + 1)
                i = i + 1
                continue
            match = charref.match(rawdata, i)
            if match:
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                # Give back the last matched character unless it is the
                # ';' that closes the reference.
                if not startswith(';', k - 1):
                    k = k - 1
                self.updatepos(i, k)
                i = k
                continue
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                if not startswith(';', k - 1):
                    k = k - 1
                self.updatepos(i, k)
                i = k
                continue
        else:
            self.error('neither < nor & ??')
        # We get here only if incomplete matches but
        # nothing else
        match = incomplete.match(rawdata, i)
        if not match:
            self.handle_data(rawdata[i])
            self.updatepos(i, i + 1)
            i = i + 1
            continue
        j = match.end(0)
        if j == n:
            break  # Really incomplete
        self.handle_data(rawdata[i:j])
        self.updatepos(i, j)
        i = j
    # end while
    if end and i < n:
        self.handle_data(rawdata[i:n])
        self.updatepos(i, n)
        i = n
    self.rawdata = rawdata[i:]
    # XXX if end: check for empty stack

# Extensions for the DOCTYPE scanner:
# NOTE(review): presumably the extra characters the inherited declaration
# scanner tolerates inside a DOCTYPE; it is not referenced by the code
# shown here -- confirm against markupbase before relying on this.
_decl_otherchars = '='

The major difference is the added updatepos calls. It seems to work fine, or at least it has worked fine for me so far.