[Python-checkins] python/dist/src/Lib/email Parser.py,1.21,1.22 (original) (raw)
bwarsaw at users.sourceforge.net bwarsaw at users.sourceforge.net
Sat May 8 23:46:44 EDT 2004
- Previous message: [Python-checkins] python/dist/src/Lib/email Message.py,1.35,1.36
- Next message: [Python-checkins] python/dist/src/Lib/email Utils.py,1.25,1.26
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Update of /cvsroot/python/python/dist/src/Lib/email In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13873
Modified Files: Parser.py Log Message: Update to Python 2.3, getting rid of backward compatibility crud.
This Parser is now just a backward compatible front-end to the FeedParser.
Index: Parser.py
RCS file: /cvsroot/python/python/dist/src/Lib/email/Parser.py,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** Parser.py 20 Mar 2004 17:31:29 -0000 1.21
--- Parser.py 9 May 2004 03:46:42 -0000 1.22
***************
*** 1,99 ****
! # Copyright (C) 2001,2002 Python Software Foundation
! # Author: barry at zope.com (Barry Warsaw)
! """A parser of RFC 2822 and MIME email messages.
! """
import re
from cStringIO import StringIO
! from types import ListType
!
! from email import Errors
! from email import Message
!
! EMPTYSTRING = ''
! NL = '\n'
!
! try:
! True, False
! except NameError:
! True = 1
! False = 0
NLCRE = re.compile('\r\n|\r|\n')
- class TextUtil:
- """ A utility class for wrapping a file object and providing a
- couple of additional useful functions.
- """
def __init__(self, fp):
self.fp = fp
self.unread = []
def readline(self):
""" Return a line of data.
If data has been pushed back with unreadline(), the most recently
returned unreadline()d data will be returned.
"""
if self.unread:
return self.unread.pop()
else:
return self.fp.readline()
def unreadline(self, line):
"""Push a line back into the object.
"""
self.unread.append(line)
def peekline(self):
"""Non-destructively look at the next line"""
line = self.readline()
self.unreadline(line)
return line
def read(self):
"""Return the remaining data
"""
r = self.fp.read()
if self.unread:
r = "\n".join(self.unread) + r
self.unread = []
return r
def readuntil(self, re, afterblank=0, includematch=0):
"""Read a line at a time until we get the specified RE.
Returns the text up to (and including, if includematch is true) the
matched text, and the RE match object. If afterblank is true,
there must be a blank line before the matched text. Moves current
filepointer to the line following the matched line. If we reach
end-of-file, return what we've got so far, and return None as the
RE match object.
"""
prematch = []
blankseen = 0
while 1:
line = self.readline()
if not line:
# end of file
return EMPTYSTRING.join(prematch), None
if afterblank:
if NLCRE.match(line):
blankseen = 1
continue
else:
blankseen = 0
m = re.match(line)
if (m and not afterblank) or (m and afterblank and blankseen):
if includematch:
prematch.append(line)
return EMPTYSTRING.join(prematch), m
prematch.append(line)
class Parser:
! def __init__(self, _class=Message.Message, strict=False):
"""Parser of RFC 2822 and MIME email messages.
--- 1,19 ----
! # Copyright (C) 2001-2004 Python Software Foundation
! # Author: Barry Warsaw, Thomas Wouters, Anthony Baxter
! # Contact: email-sig at python.org
! """A parser of RFC 2822 and MIME email messages."""
import re
from cStringIO import StringIO
! from email.FeedParser import FeedParser
! from email.Message import Message
NLCRE = re.compile('\r\n|\r|\n')
class Parser:
! def __init__(self, _class=Message, strict=False):
"""Parser of RFC 2822 and MIME email messages.
***************
*** 118,122 ****
"""
self._class = _class
- self._strict = strict
def parse(self, fp, headersonly=False):
--- 38,41 ----
***************
*** 128,140 ****
meaning it parses the entire contents of the file.
"""
! root = self._class()
! fp = TextUtil(fp)
! self._parseheaders(root, fp)
! if not headersonly:
! obj = self._parsemessage(root, fp)
! trailer = fp.read()
! if obj and trailer:
! self._attach_trailer(obj, trailer)
! return root
def parsestr(self, text, headersonly=False):
--- 47,59 ----
meaning it parses the entire contents of the file.
"""
! feedparser = FeedParser(self._class)
! if headersonly:
! feedparser._set_headersonly()
! while True:
! data = fp.read(8192)
! if not data:
! break
! feedparser.feed(data)
! return feedparser.close()
def parsestr(self, text, headersonly=False):
***************
*** 148,337 ****
return self.parse(StringIO(text), headersonly=headersonly)
- def _parseheaders(self, container, fp):
- # Parse the headers, returning a list of header/value pairs. None as
- # the header means the Unix-From header.
- lastheader = ''
- lastvalue = []
- lineno = 0
- while True:
- # Don't strip the line before we test for the end condition,
- # because whitespace-only header lines are RFC compliant
- # continuation lines.
- line = fp.readline()
- if not line:
- break
- line = line.splitlines()[0]
- if not line:
- break
- # Ignore the trailing newline
- lineno += 1
- # Check for initial Unix From line
- if line.startswith('From '):
- if lineno == 1:
- container.set_unixfrom(line)
- continue
- elif self._strict:
- raise Errors.HeaderParseError(
- 'Unix-from in headers after first rfc822 header')
- else:
- # ignore the wierdly placed From line
- # XXX: maybe set unixfrom anyway? or only if not already?
- continue
- # Header continuation line
- if line[0] in ' \t':
- if not lastheader:
- raise Errors.HeaderParseError(
- 'Continuation line seen before first header')
- lastvalue.append(line)
- continue
- # Normal, non-continuation header. BAW: this should check to make
- # sure it's a legal header, e.g. doesn't contain spaces. Also, we
- # should expose the header matching algorithm in the API, and
- # allow for a non-strict parsing mode (that ignores the line
- # instead of raising the exception).
- i = line.find(':')
- if i < 0:
- if self._strict:
- raise Errors.HeaderParseError(
- "Not a header, not a continuation: ``%s''" % line)
- elif lineno == 1 and line.startswith('--'):
- # allow through duplicate boundary tags.
- continue
- else:
- # There was no separating blank line as mandated by RFC
- # 2822, but we're in non-strict mode. So just offer up
- # this current line as the first body line.
- fp.unreadline(line)
- break
- if lastheader:
- container[lastheader] = NL.join(lastvalue)
- lastheader = line[:i]
- lastvalue = [line[i+1:].lstrip()]
- # Make sure we retain the last header
- if lastheader:
- container[lastheader] = NL.join(lastvalue)
- return
def _parsemessage(self, container, fp):
# Parse the body. We walk through the body from top to bottom,
# keeping track of the current multipart nesting as we go.
# We return the object that gets the data at the end of this
# block.
boundary = container.get_boundary()
isdigest = (container.get_content_type() == 'multipart/digest')
if boundary:
separator = '--' + boundary
boundaryRE = re.compile(
r'(?P<sep>' + re.escape(separator) +
r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
preamble, matchobj = fp.readuntil(boundaryRE)
if not matchobj:
# Broken - we hit the end of file. Just set the body
# to the text.
container.set_payload(preamble)
return container
if preamble:
container.preamble = preamble
else:
# The module docs specify an empty preamble is None, not ''
container.preamble = None
while 1:
subobj = self._class()
if isdigest:
subobj.set_default_type('message/rfc822')
firstline = fp.peekline()
if firstline.strip():
# we have MIME headers. all good.
self._parseheaders(subobj, fp)
else:
# no MIME headers. this is allowed for multipart/digest
# Consume the extra blank line
fp.readline()
pass
else:
self._parseheaders(subobj, fp)
container.attach(subobj)
maintype = subobj.get_content_maintype()
hassubparts = (subobj.get_content_maintype() in
( "message", "multipart" ))
if hassubparts:
subobj = self._parsemessage(subobj, fp)
trailer, matchobj = fp.readuntil(boundaryRE)
if matchobj is None or trailer:
mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
if not mo:
mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
if not mo:
raise Errors.BoundaryError(
'No terminating boundary and no trailing empty line')
linesep = mo.group('sep')
trailer = trailer[:-len(linesep)]
if trailer:
self._attach_trailer(subobj, trailer)
if matchobj is None or matchobj.group('end'):
# That was the last piece of data. Let our caller attach
# the epilogue to us. But before we do that, push the
# line ending of the match group back into the readline
# buffer, as it's part of the epilogue.
if matchobj:
fp.unreadline(matchobj.group('linesep'))
return container
elif container.get_content_maintype() == "multipart":
# Very bad. A message is a multipart with no boundary!
raise Errors.BoundaryError(
'multipart message with no defined boundary')
elif container.get_content_maintype() == "message":
ct = container.get_content_type()
if ct == "message/rfc822":
submessage = self._class()
self._parseheaders(submessage, fp)
self._parsemessage(submessage, fp)
container.attach(submessage)
return submessage
elif ct == "message/delivery-status":
# This special kind of type contains blocks of headers
# separated by a blank line. We'll represent each header
# block as a separate Message object
while 1:
nextblock = self._class()
self._parseheaders(nextblock, fp)
container.attach(nextblock)
# next peek ahead to see whether we've hit the end or not
nextline = fp.peekline()
if nextline[:2] == "--":
break
return container
else:
# Other sort of message object (e.g. external-body)
msg = self._class()
self._parsemessage(msg, fp)
container.attach(msg)
return msg
else:
# single body section. We let our caller set the payload.
return container
def _attach_trailer(self, obj, trailer):
if obj.get_content_maintype() in ("message", "multipart"):
obj.epilogue = trailer
else:
obj.set_payload(trailer)
class HeaderParser(Parser):
! """A subclass of Parser, this one only meaningfully parses message headers.
!
! This class can be used if all you're interested in is the headers of a
! message. While it consumes the message body, it does not parse it, but
! simply makes it available as a string payload.
!
! Parsing with this subclass can be considerably faster if all you're
! interested in is the message headers.
! """
! def _parsemessage(self, container, fp):
! # Consume but do not parse, the body
! text = fp.read()
! container.set_payload(text)
! return None
--- 67,76 ----
return self.parse(StringIO(text), headersonly=headersonly)
class HeaderParser(Parser):
! def parse(self, fp, headersonly=True):
! return Parser.parse(self, fp, True)
!
! def parsestr(self, text, headersonly=True):
! return Parser.parsestr(self, text, True)
- Previous message: [Python-checkins] python/dist/src/Lib/email Message.py,1.35,1.36
- Next message: [Python-checkins] python/dist/src/Lib/email Utils.py,1.25,1.26
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]