[Python-checkins] python/dist/src/Lib/email FeedParser.py,1.1,1.2 (original) (raw)

bwarsaw at users.sourceforge.net bwarsaw at users.sourceforge.net
Sat May 8 23:29:25 EDT 2004


Update of /cvsroot/python/python/dist/src/Lib/email In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11011/Lib/email

Modified Files: FeedParser.py Log Message: An updated FeedParser that should be RFC compliant, passes all existing (standard) tests, and doesn't throw parse errors. I still need to throw Anthony's torture test at it, but I wanted to get this checked in and off my disk.

Index: FeedParser.py

          if lastheader:
              # XXX reconsider the joining of folded lines

! self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip() lastheader, lastvalue = '', [] ! ! # Check for Unix-From if line.startswith('From '): if lineno == 0: self._cur.set_unixfrom(line) continue ! elif lineno == len(headerlist) - 1: # Something looking like a unix-from at the end - it's ! # probably the first line of the body self._input.unreadline(line) return else: ! # Weirdly placed unix-from line. Ignore it. continue ! i = line.find(':') if i < 0: ! # The older parser had various special-cases here. We've ! # already handled them ! raise Errors.HeaderParseError( ! "Not a header, not a continuation: ``%s''" % line) lastheader = line[:i] lastvalue = [line[i+1:].lstrip()] ! if lastheader: # XXX reconsider the joining of folded lines self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()

- - def _parsegen(self): - # Parse any currently available text - self._new_sub_object() - self._root = self._cur - completing = False - last = None

! def close(self): ! """Parse all remaining data and return the root message object.""" ! self._input.close() ! self._call_parse() ! root = self._pop_message() ! assert not self._msgstack ! return root

! def _new_message(self): ! msg = self._factory() ! if self._cur and self._cur.get_content_type() == 'multipart/digest': ! msg.set_default_type('message/rfc822') ! if self._msgstack: ! self._msgstack[-1].attach(msg) ! self._msgstack.append(msg) ! self._cur = msg ! self._cur.defects = [] ! self._last = msg

! def _parsegen(self): ! # Create a new message and start by parsing headers. ! self._new_message() ! headers = [] ! # Collect the headers, searching for a line that doesn't match the RFC ! # 2822 header or continuation pattern (including an empty line). ! for line in self._input: ! if line is NeedMoreData: ! yield NeedMoreData ! continue ! if not headerRE.match(line): ! # If we saw the RFC defined header/body separator ! # (i.e. newline), just throw it away. Otherwise the line is ! # part of the body so push it back. ! if not NLCRE.match(line): ! self._input.unreadline(line) ! break ! headers.append(line) ! # Done with the headers, so parse them and figure out what we're ! # supposed to see in the body of the message. ! self._parse_headers(headers) ! # Headers-only parsing is a backwards compatibility hack, which was ! # necessary in the older parser, which could throw errors. All ! # remaining lines in the input are thrown into the message body. ! if self._headersonly: ! lines = [] ! while True: ! line = self._input.readline() ! if line is NeedMoreData: ! yield NeedMoreData ! continue ! if line == '': ! break ! lines.append(line) ! self._cur.set_payload(EMPTYSTRING.join(lines)) ! return ! # So now the input is sitting at the first body line. If the message ! # claims to be a message/rfc822 type, then what follows is another RFC ! # 2822 message. ! if self._cur.get_content_type() == 'message/rfc822': ! for retval in self._parsegen(): ! if retval is NeedMoreData: ! yield NeedMoreData ! continue ! break ! self._pop_message() ! return ! if self._cur.get_content_type() == 'message/delivery-status': ! # message/delivery-status contains blocks of headers separated by ! # a blank line. We'll represent each header block as a separate ! # nested message object. A blank line separates the subparts. ! while True: ! self._input.push_eof_matcher(NLCRE.match) ! for retval in self._parsegen(): ! if retval is NeedMoreData: ! yield NeedMoreData ! continue ! break ! 
msg = self._pop_message() ! # We need to pop the EOF matcher in order to tell if we're at ! # the end of the current file, not the end of the last block ! # of message headers. ! self._input.pop_eof_matcher() ! # The input stream must be sitting at the newline or at the ! # EOF. We want to see if we're at the end of this subpart, so ! # first consume the blank line, then test the next line to see ! # if we're at this subpart's EOF. ! line = self._input.readline() ! line = self._input.readline() ! if line == '': ! break ! # Not at EOF so this is a line we're going to need. ! self._input.unreadline(line) ! return ! if self._cur.get_content_maintype() == 'multipart': ! boundary = self._cur.get_boundary() ! if boundary is None: ! # The message /claims/ to be a multipart but it has not ! # defined a boundary. That's a problem which we'll handle by ! # reading everything until the EOF and marking the message as ! # defective. ! self._cur.defects.append(Errors.NoBoundaryInMultipart()) ! lines = [] ! for line in self._input: ! if line is NeedMoreData: ! yield NeedMoreData ! continue ! lines.append(line) ! self._cur.set_payload(EMPTYSTRING.join(lines)) ! return ! # Create a line match predicate which matches the inter-part ! # boundary as well as the end-of-multipart boundary. Don't push ! # this onto the input stream until we've scanned past the ! # preamble. ! separator = '--' + boundary ! boundaryre = re.compile( ! '(?P<sep>' + re.escape(separator) + ! r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$') ! capturing_preamble = True ! preamble = [] ! linesep = False ! while True: ! line = self._input.readline() ! if line is NeedMoreData: ! yield NeedMoreData ! continue ! if line == '': ! break ! mo = boundaryre.match(line) ! if mo: ! # If we're looking at the end boundary, we're done with ! # this multipart. If there was a newline at the end of ! # the closing boundary, then we need to initialize the ! # epilogue with the empty string (see below). ! if mo.group('end'): ! 
linesep = mo.group('linesep') ! break ! # We saw an inter-part boundary. Were we in the preamble? ! if capturing_preamble: ! if preamble: ! # According to RFC 2046, the last newline belongs ! # to the boundary. ! lastline = preamble[-1] ! eolmo = NLCRE_eol.search(lastline) ! if eolmo: ! preamble[-1] = lastline[:-len(eolmo.group(0))] ! self._cur.preamble = EMPTYSTRING.join(preamble) ! capturing_preamble = False ! self._input.unreadline(line) ! continue ! # We saw a boundary separating two parts. Recurse to ! # parse this subpart; the input stream points at the ! # subpart's first line. ! self._input.push_eof_matcher(boundaryre.match) ! for retval in self._parsegen(): ! if retval is NeedMoreData: ! yield NeedMoreData ! continue ! break ! # Because of RFC 2046, the newline preceding the boundary ! # separator actually belongs to the boundary, not the ! # previous subpart's payload (or epilogue if the previous ! # part is a multipart). ! if self._last.get_content_maintype() == 'multipart': ! epilogue = self._last.epilogue ! if epilogue == '': ! self._last.epilogue = None ! elif epilogue is not None: ! mo = NLCRE_eol.search(epilogue) ! if mo: ! end = len(mo.group(0)) ! self._last.epilogue = epilogue[:-end] ! else: ! payload = self._last.get_payload() ! if isinstance(payload, basestring): ! mo = NLCRE_eol.search(payload) ! if mo: ! payload = payload[:-len(mo.group(0))] ! self._last.set_payload(payload) ! self._input.pop_eof_matcher() ! self._pop_message() ! # Set the multipart up for newline cleansing, which will ! # happen if we're in a nested multipart. ! self._last = self._cur ! else: ! # I think we must be in the preamble ! assert capturing_preamble ! preamble.append(line) ! # We've seen either the EOF or the end boundary. If we're still ! # capturing the preamble, we never saw the start boundary. Note ! # that as a defect and store the captured text as the payload. ! # Otherwise everything from here to the EOF is epilogue. ! if capturing_preamble: ! 
self._cur.defects.append(Errors.StartBoundaryNotFound()) ! self._cur.set_payload(EMPTYSTRING.join(preamble)) ! return ! # If the end boundary ended in a newline, we'll need to make sure ! # the epilogue isn't None ! if linesep: ! epilogue = [''] ! else: ! epilogue = [] ! for line in self._input: ! if line is NeedMoreData: ! yield NeedMoreData ! continue ! epilogue.append(line) ! # Any CRLF at the front of the epilogue is not technically part of ! # the epilogue. Also, watch out for an empty string epilogue, ! # which means a single newline. ! firstline = epilogue[0] ! bolmo = NLCRE_bol.match(firstline) ! if bolmo: ! epilogue[0] = firstline[len(bolmo.group(0)):] ! self._cur.epilogue = EMPTYSTRING.join(epilogue) ! return ! # Otherwise, it's some non-multipart type, so the entire rest of the ! # file contents becomes the payload. ! lines = [] ! for line in self._input: ! if line is NeedMoreData: ! yield NeedMoreData ! continue ! lines.append(line) ! self._cur.set_payload(EMPTYSTRING.join(lines)) ! ! def _parse_headers(self, lines): ! # Passed a list of lines that make up the headers for the current msg ! lastheader = '' ! lastvalue = [] ! for lineno, line in enumerate(lines): # Check for continuation if line[0] in ' \t': if not lastheader: ! # The first line of the headers was a continuation. This ! # is illegal, so let's note the defect, store the illegal ! # line, and ignore it for purposes of headers. ! defect = Errors.FirstHeaderLineIsContinuation(line) ! self._cur.defects.append(defect) ! continue lastvalue.append(line) continue if lastheader: # XXX reconsider the joining of folded lines ! self._cur[lastheader] = EMPTYSTRING.join(lastvalue)[:-1] lastheader, lastvalue = '', [] ! # Check for envelope header, i.e. unix-from if line.startswith('From '): if lineno == 0: self._cur.set_unixfrom(line) continue ! elif lineno == len(lines) - 1: # Something looking like a unix-from at the end - it's ! # probably the first line of the body, so push back the ! 
# line and stop. self._input.unreadline(line) return else: ! # Weirdly placed unix-from line. Note this as a defect ! # and ignore it. ! defect = Errors.MisplacedEnvelopeHeader(line) ! self._cur.defects.append(defect) continue ! # Split the line on the colon separating field name from value. i = line.find(':') if i < 0: ! defect = Errors.MalformedHeader(line) ! self._cur.defects.append(defect) ! continue lastheader = line[:i] lastvalue = [line[i+1:].lstrip()] ! # Done with all the lines, so handle the last header. if lastheader: # XXX reconsider the joining of folded lines self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()



More information about the Python-checkins mailing list