
--- ../feedparser/feedparser.py (original)
+++ ../feedparser/feedparser.py (refactored)
@@ -77,9 +77,9 @@
 # ---------- required modules (should come with any Python distribution) ----------
 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
 try:
-    from cStringIO import StringIO as _StringIO
+    from io import StringIO as _StringIO
 except:
-    from StringIO import StringIO as _StringIO
+    from io import StringIO as _StringIO

 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
@@ -146,8 +146,8 @@
     import htmlentitydefs
     name2codepoint={}
     codepoint2name={}
-    for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
-        if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
+    for (name,codepoint) in htmlentitydefs.entitydefs.items():
+        if codepoint.startswith('&#'): codepoint=chr(int(codepoint[2:-1]))
         name2codepoint[name]=ord(codepoint)
         codepoint2name[ord(codepoint)]=name
@@ -228,16 +228,16 @@
         if key == 'category':
             return UserDict.__getitem__(self, 'tags')[0]['term']
         if key == 'enclosures':
-            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
+            norel = lambda link: FeedParserDict([(name,value) for (name,value) in list(link.items()) if name!='rel'])
             return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
         if key == 'license':
             for link in UserDict.__getitem__(self, 'links'):
-                if link['rel']=='license' and link.has_key('href'):
+                if link['rel']=='license' and 'href' in link:
                     return link['href']
         if key == 'categories':
             return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
         realkey = self.keymap.get(key, key)
-        if type(realkey) == types.ListType:
+        if isinstance(realkey, list):
             for k in realkey:
                 if UserDict.has_key(self, k):
                     return UserDict.__getitem__(self, k)
@@ -246,21 +246,21 @@
         return UserDict.__getitem__(self, realkey)

     def __setitem__(self, key, value):
-        for k in self.keymap.keys():
+        for k in list(self.keymap.keys()):
             if key == k:
                 key = self.keymap[k]
-                if type(key) == types.ListType:
+                if isinstance(key, list):
                     key = key[0]
         return UserDict.__setitem__(self, key, value)

     def get(self, key, default=None):
-        if self.has_key(key):
+        if key in self:
             return self[key]
         else:
             return default

     def setdefault(self, key, value):
-        if not self.has_key(key):
+        if key not in self:
             self[key] = value
         return self[key]

@@ -279,7 +279,7 @@
             assert not key.startswith('_')
             return self.__getitem__(key)
         except:
-            raise AttributeError, "object has no attribute '%s'" % key
+            raise AttributeError("object has no attribute '%s'" % key)

     def __setattr__(self, key, value):
         if key.startswith('_') or key == 'data':
@@ -288,7 +288,7 @@
             return self.__setitem__(key, value)

     def __contains__(self, key):
-        return self.has_key(key)
+        return key in self

 def zopeCompatibilityHack():
     global FeedParserDict
@@ -327,33 +327,33 @@
     return s.translate(_ebcdic_to_ascii_map)

 _cp1252 = {
-  unichr(128): unichr(8364), # euro sign
-  unichr(130): unichr(8218), # single low-9 quotation mark
-  unichr(131): unichr( 402), # latin small letter f with hook
-  unichr(132): unichr(8222), # double low-9 quotation mark
-  unichr(133): unichr(8230), # horizontal ellipsis
-  unichr(134): unichr(8224), # dagger
-  unichr(135): unichr(8225), # double dagger
-  unichr(136): unichr( 710), # modifier letter circumflex accent
-  unichr(137): unichr(8240), # per mille sign
-  unichr(138): unichr( 352), # latin capital letter s with caron
-  unichr(139): unichr(8249), # single left-pointing angle quotation mark
-  unichr(140): unichr( 338), # latin capital ligature oe
-  unichr(142): unichr( 381), # latin capital letter z with caron
-  unichr(145): unichr(8216), # left single quotation mark
-  unichr(146): unichr(8217), # right single quotation mark
-  unichr(147): unichr(8220), # left double quotation mark
-  unichr(148): unichr(8221), # right double quotation mark
-  unichr(149): unichr(8226), # bullet
-  unichr(150): unichr(8211), # en dash
-  unichr(151): unichr(8212), # em dash
-  unichr(152): unichr( 732), # small tilde
-  unichr(153): unichr(8482), # trade mark sign
-  unichr(154): unichr( 353), # latin small letter s with caron
-  unichr(155): unichr(8250), # single right-pointing angle quotation mark
-  unichr(156): unichr( 339), # latin small ligature oe
-  unichr(158): unichr( 382), # latin small letter z with caron
-  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
+  chr(128): chr(8364), # euro sign
+  chr(130): chr(8218), # single low-9 quotation mark
+  chr(131): chr( 402), # latin small letter f with hook
+  chr(132): chr(8222), # double low-9 quotation mark
+  chr(133): chr(8230), # horizontal ellipsis
+  chr(134): chr(8224), # dagger
+  chr(135): chr(8225), # double dagger
+  chr(136): chr( 710), # modifier letter circumflex accent
+  chr(137): chr(8240), # per mille sign
+  chr(138): chr( 352), # latin capital letter s with caron
+  chr(139): chr(8249), # single left-pointing angle quotation mark
+  chr(140): chr( 338), # latin capital ligature oe
+  chr(142): chr( 381), # latin capital letter z with caron
+  chr(145): chr(8216), # left single quotation mark
+  chr(146): chr(8217), # right single quotation mark
+  chr(147): chr(8220), # left double quotation mark
+  chr(148): chr(8221), # right double quotation mark
+  chr(149): chr(8226), # bullet
+  chr(150): chr(8211), # en dash
+  chr(151): chr(8212), # em dash
+  chr(152): chr( 732), # small tilde
+  chr(153): chr(8482), # trade mark sign
+  chr(154): chr( 353), # latin small letter s with caron
+  chr(155): chr(8250), # single right-pointing angle quotation mark
+  chr(156): chr( 339), # latin small ligature oe
+  chr(158): chr( 382), # latin small letter z with caron
+  chr(159): chr( 376)} # latin capital letter y with diaeresis

 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 def _urljoin(base, uri):
@@ -437,7 +437,7 @@
     def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
         if _debug: sys.stderr.write('initializing FeedParser\n')
         if not self._matchnamespaces:
-            for k, v in self.namespaces.items():
+            for k, v in list(self.namespaces.items()):
                 self._matchnamespaces[k.lower()] = v
         self.feeddata = FeedParserDict() # feed-level data
         self.encoding = encoding # character encoding
@@ -501,7 +501,7 @@
             self.trackNamespace(None, uri)

         # track inline content
-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+        if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
             # element declared itself as escaped markup, but it isn't really
             self.contentparams['type'] = 'application/xhtml+xml'
         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
@@ -513,7 +513,7 @@
             # because that compensates for the bugs in our namespace handling.
             # This will horribly munge inline content with non-empty qnames,
             # but nobody actually does that, so I'm not fixing it.
-            if tag.find(':') <> -1:
+            if tag.find(':') != -1:
                 prefix, tag = tag.split(':', 1)
                 namespace = self.namespacesInUse.get(prefix, '')
                 if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
@@ -524,7 +524,7 @@
             return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)

         # match namespaces
-        if tag.find(':') <> -1:
+        if tag.find(':') != -1:
             prefix, suffix = tag.split(':', 1)
         else:
             prefix, suffix = '', tag
@@ -549,7 +549,7 @@
     def unknown_endtag(self, tag):
         if _debug: sys.stderr.write('end %s\n' % tag)
         # match namespaces
-        if tag.find(':') <> -1:
+        if tag.find(':') != -1:
             prefix, suffix = tag.split(':', 1)
         else:
             prefix, suffix = '', tag
@@ -567,7 +567,7 @@
             self.pop(prefix + suffix)

         # track inline content
-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+        if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
             # element declared itself as escaped markup, but it isn't really
             self.contentparams['type'] = 'application/xhtml+xml'
         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
@@ -595,7 +595,7 @@
                 c = int(ref[1:], 16)
             else:
                 c = int(ref)
-            text = unichr(c).encode('utf-8')
+            text = chr(c).encode('utf-8')
         self.elementstack[-1][2].append(text)

     def handle_entityref(self, ref):
@@ -604,14 +604,14 @@
         if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
             text = '&%s;' % ref
-        elif ref in self.entities.keys():
+        elif ref in list(self.entities.keys()):
             text = self.entities[ref]
             if text.startswith('&#') and text.endswith(';'):
                 return self.handle_entityref(text)
         else:
             try: name2codepoint[ref]
             except KeyError: text = '&%s;' % ref
-            else: text = unichr(name2codepoint[ref]).encode('utf-8')
+            else: text = chr(name2codepoint[ref]).encode('utf-8')
         self.elementstack[-1][2].append(text)

     def handle_data(self, text, escape=1):
@@ -663,11 +663,11 @@
             self.version = 'rss10'
         if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
             self.version = 'atom10'
-        if loweruri.find('backend.userland.com/rss') <> -1:
+        if loweruri.find('backend.userland.com/rss') != -1:
             # match any backend.userland.com namespace
             uri = 'http://backend.userland.com/rss'
             loweruri = uri
-        if self._matchnamespaces.has_key(loweruri):
+        if loweruri in self._matchnamespaces:
             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
         else:
@@ -773,23 +773,23 @@
         if element in self.can_contain_dangerous_markup:
             output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))

-        if self.encoding and type(output) != type(u''):
+        if self.encoding and not isinstance(output, type('')):
             try:
-                output = unicode(output, self.encoding)
+                output = str(output, self.encoding)
             except:
                 pass

         # address common error where people take data that is already
         # utf-8, presume that it is iso-8859-1, and re-encode it.
-        if self.encoding=='utf-8' and type(output) == type(u''):
+        if self.encoding=='utf-8' and isinstance(output, type('')):
             try:
-                output = unicode(output.encode('iso-8859-1'), 'utf-8')
+                output = str(output.encode('iso-8859-1'), 'utf-8')
             except:
                 pass

         # map win-1252 extensions to the proper code points
-        if type(output) == type(u''):
-            output = u''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
+        if isinstance(output, type('')):
+            output = ''.join([c in list(_cp1252.keys()) and _cp1252[c] or c for c in output])

         # categories/tags/keywords/whatever are handled in _end_category
         if element == 'category':
@@ -855,19 +855,17 @@
         if not (re.search(r'</(\w+)>',str) or re.search("&#?\w+;",str)): return

         # all tags must be in a restricted subset of valid HTML tags
-        if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
-            re.findall(r'</?(\w+)',str)):
+        if [t for t in re.findall(r'</?(\w+)',str) if t.lower() not in _HTMLSanitizer.acceptable_elements]:
             return

         # all entities must have been defined as valid HTML entities
         from htmlentitydefs import entitydefs
-        if filter(lambda e: e not in entitydefs.keys(),
-            re.findall(r'&(\w+);',str)):
+        if [e for e in re.findall(r'&(\w+);',str) if e not in list(entitydefs.keys())]:
             return

         return 1

     def _mapToStandardPrefix(self, name):
         colonpos = name.find(':')
-        if colonpos <> -1:
+        if colonpos != -1:
             prefix = name[:colonpos]
             suffix = name[colonpos+1:]
             prefix = self.namespacemap.get(prefix, prefix)
@@ -930,11 +928,11 @@
     _start_feedinfo = _start_channel

     def _cdf_common(self, attrsD):
-        if attrsD.has_key('lastmod'):
+        if 'lastmod' in attrsD:
             self._start_modified({})
             self.elementstack[-1][-1] = attrsD['lastmod']
             self._end_modified()
-        if attrsD.has_key('href'):
+        if 'href' in attrsD:
             self._start_link({})
             self.elementstack[-1][-1] = attrsD['href']
             self._end_link()
@@ -1333,14 +1331,14 @@
         attrsD.setdefault('type', 'text/html')
         context = self._getContext()
         attrsD = self._itsAnHrefDamnIt(attrsD)
-        if attrsD.has_key('href'):
+        if 'href' in attrsD:
             attrsD['href'] = self.resolveURI(attrsD['href'])
             if attrsD.get('rel')=='enclosure' and not context.get('id'):
                 context['id'] = attrsD.get('href')
         expectingText = self.infeed or self.inentry or self.insource
         context.setdefault('links', [])
         context['links'].append(FeedParserDict(attrsD))
-        if attrsD.has_key('href'):
+        if 'href' in attrsD:
             expectingText = 0
             if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                 context['link'] = attrsD['href']
@@ -1359,14 +1357,14 @@

     def _end_guid(self):
         value = self.pop('id')
-        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
+        self._save('guidislink', self.guidislink and 'link' not in self._getContext())
         if self.guidislink:
             # guid acts as link, but only if 'ispermalink' is not present or is 'true',
             # and only if the item doesn't already have a link element
             self._save('link', value)

     def _start_title(self, attrsD):
-        if self.svgOK: return self.unknown_starttag('title', attrsD.items())
+        if self.svgOK: return self.unknown_starttag('title', list(attrsD.items()))
         self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
     _start_dc_title = _start_title
     _start_media_title = _start_title
@@ -1381,7 +1379,7 @@

     def _start_description(self, attrsD):
         context = self._getContext()
-        if context.has_key('summary'):
+        if 'summary' in context:
             self._summaryKey = 'content'
             self._start_content(attrsD)
         else:
@@ -1411,7 +1409,7 @@
     def _start_generator(self, attrsD):
         if attrsD:
             attrsD = self._itsAnHrefDamnIt(attrsD)
-            if attrsD.has_key('href'):
+            if 'href' in attrsD:
                 attrsD['href'] = self.resolveURI(attrsD['href'])
         self._getContext()['generator_detail'] = FeedParserDict(attrsD)
         self.push('generator', 1)
@@ -1419,7 +1417,7 @@
     def _end_generator(self):
         value = self.pop('generator')
         context = self._getContext()
-        if context.has_key('generator_detail'):
+        if 'generator_detail' in context:
             context['generator_detail']['name'] = value

     def _start_admin_generatoragent(self, attrsD):
@@ -1439,7 +1437,7 @@

     def _start_summary(self, attrsD):
         context = self._getContext()
-        if context.has_key('summary'):
+        if 'summary' in context:
             self._summaryKey = 'content'
             self._start_content(attrsD)
         else:
@@ -1530,7 +1528,7 @@
     def startElementNS(self, name, qname, attrs):
         namespace, localname = name
         lowernamespace = str(namespace or '').lower()
-        if lowernamespace.find('backend.userland.com/rss') <> -1:
+        if lowernamespace.find('backend.userland.com/rss') != -1:
             # match any backend.userland.com namespace
             namespace = 'http://backend.userland.com/rss'
             lowernamespace = namespace
@@ -1539,8 +1537,8 @@
         else:
             givenprefix = None
         prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
-        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
-            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
+        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
+            raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
         localname = str(localname).lower()

         # qname implementation is horribly broken in Python 2.1 (it
@@ -1559,13 +1557,13 @@
         if prefix:
             localname = prefix.lower() + ':' + localname
         elif namespace and not qname: #Expat
-            for name,value in self.namespacesInUse.items():
+            for name,value in list(self.namespacesInUse.items()):
                 if name and value == namespace:
                     localname = name + ':' + localname
                     break
-        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
-
-        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
+        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, list(attrs.items()), localname))
+
+        for (namespace, attrlocalname), attrvalue in list(attrs._attrs.items()):
             lowernamespace = (namespace or '').lower()
             prefix = self._matchnamespaces.get(lowernamespace, '')
             if prefix:
@@ -1573,7 +1571,7 @@
             attrsD[str(attrlocalname).lower()] = attrvalue
         for qname in attrs.getQNames():
             attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
-        self.unknown_starttag(localname, attrsD.items())
+        self.unknown_starttag(localname, list(attrsD.items()))

     def characters(self, text):
         self.handle_data(text)
@@ -1589,7 +1587,7 @@
         if prefix:
             localname = prefix + ':' + localname
         elif namespace and not qname: #Expat
-            for name,value in self.namespacesInUse.items():
+            for name,value in list(self.namespacesInUse.items()):
                 if name and value == namespace:
                     localname = name + ':' + localname
                     break
@@ -1640,7 +1638,7 @@
         data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
         data = data.replace('&#39;', "'")
         data = data.replace('&#34;', '"')
-        if self.encoding and type(data) == type(u''):
+        if self.encoding and isinstance(data, type('')):
             data = data.encode(self.encoding)
         sgmllib.SGMLParser.feed(self, data)
         sgmllib.SGMLParser.close(self)
@@ -1648,7 +1646,7 @@
     def normalize_attrs(self, attrs):
         if not attrs: return attrs
         # utility method to be called by descendants
-        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
+        attrs = list(dict([(k.lower(), v) for k, v in attrs]).items())
         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
         attrs.sort()
         return attrs
@@ -1665,13 +1663,13 @@
                 value=value.replace('&gt;','>').replace('&lt;','<').replace('&quot;','"')
                 value = self.bare_ampersand.sub("&amp;", value)
                 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
-                if type(value) != type(u''):
+                if not isinstance(value, type('')):
                     try:
-                        value = unicode(value, self.encoding)
+                        value = str(value, self.encoding)
                     except:
-                        value = unicode(value, 'iso-8859-1')
-                uattrs.append((unicode(key, self.encoding), value))
-            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
+                        value = str(value, 'iso-8859-1')
+                uattrs.append((str(key, self.encoding), value))
+            strattrs = ''.join([' %s="%s"' % (key, value) for key, value in uattrs])
             if self.encoding:
                 try:
                     strattrs=strattrs.encode(self.encoding)
@@ -1692,11 +1690,11 @@
         # called for each character reference, e.g. for '&#160;', ref will be '160'
         # Reconstruct the original character reference.
         if ref.startswith('x'):
-            value = unichr(int(ref[1:],16))
+            value = chr(int(ref[1:],16))
         else:
-            value = unichr(int(ref))
-
-        if value in _cp1252.keys():
+            value = chr(int(ref))
+
+        if value in list(_cp1252.keys()):
             self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
         else:
             self.pieces.append('&#%(ref)s;' % locals())
@@ -1704,7 +1702,7 @@
     def handle_entityref(self, ref):
         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
         # Reconstruct the original entity reference.
-        if name2codepoint.has_key(ref):
+        if ref in name2codepoint:
             self.pieces.append('&%(ref)s;' % locals())
         else:
             self.pieces.append('&amp;%(ref)s' % locals())
@@ -1781,7 +1779,7 @@
         data = data.replace('&#x22;', '&quot;')
         data = data.replace('&#39;', '&apos;')
         data = data.replace('&#x27;', '&apos;')
-        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+        if 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
             data = data.replace('&lt;', '<')
             data = data.replace('&gt;', '>')
             data = data.replace('&amp;', '&')
@@ -1806,7 +1804,7 @@
         self.document = BeautifulSoup.BeautifulSoup(data)
         self.baseuri = baseuri
         self.encoding = encoding
-        if type(data) == type(u''):
+        if isinstance(data, type('')):
             data = data.encode(encoding)
         self.tags = []
         self.enclosures = []
@@ -1814,7 +1812,7 @@
         self.vcard = None

     def vcardEscape(self, s):
-        if type(s) in (type(''), type(u'')):
+        if type(s) in (type(''), type('')):
             s = s.replace(',', '\\,').replace(';', '\\;').replace('\n', '\\n')
         return s
@@ -2147,7 +2145,7 @@
     def isProbablyDownloadable(self, elm):
         attrsD = elm.attrMap
-        if not attrsD.has_key('href'): return 0
+        if 'href' not in attrsD: return 0
         linktype = attrsD.get('type', '').strip()
         if linktype.startswith('audio/') or \
            linktype.startswith('video/') or \
@@ -2415,7 +2413,7 @@

         # declare xlink namespace, if needed
         if self.mathmlOK or self.svgOK:
-            if filter(lambda (n,v): n.startswith('xlink:'),attrs):
+            if [n_v for n_v in attrs if n_v[0].startswith('xlink:')]:
                 if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
                     attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
@@ -2502,12 +2500,12 @@
         except:
             pass
     if _tidy:
-        utf8 = type(data) == type(u'')
+        utf8 = isinstance(data, type(''))
         if utf8:
            data = data.encode('utf-8')
        data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
        if utf8:
-            data = unicode(data, 'utf-8')
+            data = str(data, 'utf-8')
        if data.count('<body'):
            data = data.split('<body', 1)[1]
            if data.count('>'):
@@ -2526,7 +2524,7 @@
         return infourl

     def http_error_302(self, req, fp, code, msg, headers):
-        if headers.dict.has_key('location'):
+        if 'location' in headers.dict:
             infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
         else:
             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
@@ -2535,7 +2533,7 @@
         return infourl

     def http_error_301(self, req, fp, code, msg, headers):
-        if headers.dict.has_key('location'):
+        if 'location' in headers.dict:
             infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
         else:
             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
@@ -2622,7 +2620,7 @@

         # iri support
         try:
-            if isinstance(url_file_stream_or_string,unicode):
+            if isinstance(url_file_stream_or_string,str):
                 url_file_stream_or_string = url_file_stream_or_string.encode('idna')
             else:
                 url_file_stream_or_string = url_file_stream_or_string.decode('utf-8').encode('idna')
@@ -2634,7 +2632,7 @@
         request.add_header('User-Agent', agent)
         if etag:
             request.add_header('If-None-Match', etag)
-        if type(modified) == type(''):
+        if isinstance(modified, type('')):
             modified = _parse_date(modified)
         if modified:
             # format into an RFC 1123-compliant timestamp. We can't use
@@ -2659,7 +2657,7 @@
         if ACCEPT_HEADER:
             request.add_header('Accept', ACCEPT_HEADER)
         request.add_header('A-IM', 'feed') # RFC 3229 support
-        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
+        opener = urllib2.build_opener(*tuple([_FeedURLHandler()] + handlers))
         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
         try:
             return opener.open(request)
@@ -2755,7 +2753,7 @@
         day = int(day)
     # special case of the century - is the first year of the 21st century
     # 2000 or 2001 ? The debate goes on...
-    if 'century' in params.keys():
+    if 'century' in list(params.keys()):
         year = (int(params['century']) - 1) * 100 + 1
     # in ISO 8601 most fields are optional
     for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
@@ -2787,17 +2785,17 @@
 registerDateHandler(_parse_date_iso8601)

 # 8-bit date handling routines written by ytrewq1.
-_korean_year  = u'\ub144' # b3e2 in euc-kr
-_korean_month = u'\uc6d4' # bff9 in euc-kr
-_korean_day   = u'\uc77c' # c0cf in euc-kr
-_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
-_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
+_korean_year  = '\ub144' # b3e2 in euc-kr
+_korean_month = '\uc6d4' # bff9 in euc-kr
+_korean_day   = '\uc77c' # c0cf in euc-kr
+_korean_am    = '\uc624\uc804' # bfc0 c0fc in euc-kr
+_korean_pm    = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr

 _korean_onblog_date_re = \
     re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
         (_korean_year, _korean_month, _korean_day))
 _korean_nate_date_re = \
-    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
+    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
         (_korean_am, _korean_pm))
 def _parse_date_onblog(dateString):
     '''Parse a string according to the OnBlog 8-bit date format'''
@@ -2847,40 +2845,40 @@
 # Unicode strings for Greek date strings
 _greek_months = \
   { \
-   u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
-   u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
-   u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
-   u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
-   u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
-   u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
-   u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
-   u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
-   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
-   u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
-   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
-   u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
-   u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
-   u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
-   u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
-   u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
-   u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
-   u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
-   u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
+   '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7
+   '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7
+   '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7
+   '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7
+   '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7
+   '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7
+   '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7
+   '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7
+   '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7
+   '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7
+   '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7
+   '\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7
+   '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7
+   '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7
+   '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7
+   '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7
+   '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7
+   '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7
+   '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7
   }

 _greek_wdays = \
   { \
-   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
-   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
-   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
-   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
-   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
-   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
-   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
+   '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7
+   '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7
+   '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7
+   '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7
+   '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7
+   '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7
+   '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7
   }

 _greek_date_format_re = \
-    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
+    re.compile('([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

 def _parse_date_greek(dateString):
     '''Parse a string according to a Greek 8-bit date format.'''
@@ -2902,22 +2900,22 @@
 # Unicode strings for Hungarian date strings
 _hungarian_months = \
   { \
-    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
-    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
-    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
-    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
-    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
-    u'j\u00fanius':   u'06',  # fa in iso-8859-2
-    u'j\u00falius':   u'07',  # fa in iso-8859-2
-    u'augusztus':     u'08',
-    u'szeptember':    u'09',
-    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
-    u'november':      u'11',
-    u'december':      u'12',
+    'janu\u00e1r':   '01',  # e1 in iso-8859-2
+    'febru\u00e1ri': '02',  # e1 in iso-8859-2
+    'm\u00e1rcius':  '03',  # e1 in iso-8859-2
+    '\u00e1prilis':  '04',  # e1 in iso-8859-2
+    'm\u00e1ujus':   '05',  # e1 in iso-8859-2
+    'j\u00fanius':   '06',  # fa in iso-8859-2
+    'j\u00falius':   '07',  # fa in iso-8859-2
+    'augusztus':     '08',
+    'szeptember':    '09',
+    'okt\u00f3ber':  '10',  # f3 in iso-8859-2
+    'november':      '11',
+    'december':      '12',
   }

 _hungarian_date_format_re = \
-  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
+  re.compile('(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

 def _parse_date_hungarian(dateString):
     '''Parse a string according to a Hungarian 8-bit date format.'''
@@ -3090,7 +3088,7 @@
                 raise ValueError
             map(int, date9tuple)
             return date9tuple
-        except Exception, e:
+        except Exception as e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None
@@ -3169,39 +3167,39 @@
    elif xml_data[:4] == '\x00\x3c\x00\x3f':
        # UTF-16BE
        sniffed_xml_encoding = 'utf-16be'
-        xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
+        xml_data = str(xml_data, 'utf-16be').encode('utf-8')
    elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
        # UTF-16BE with BOM
        sniffed_xml_encoding = 'utf-16be'
-        xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
+        xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8')
    elif xml_data[:4] == '\x3c\x00\x3f\x00':
        # UTF-16LE
        sniffed_xml_encoding = 'utf-16le'
-        xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
+        xml_data = str(xml_data, 'utf-16le').encode('utf-8')
    elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
        # UTF-16LE with BOM
        sniffed_xml_encoding = 'utf-16le'
-        xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
+        xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8')
    elif xml_data[:4] == '\x00\x00\x00\x3c':
        # UTF-32BE
        sniffed_xml_encoding = 'utf-32be'
-        xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
+        xml_data = str(xml_data, 'utf-32be').encode('utf-8')
    elif xml_data[:4] == '\x3c\x00\x00\x00':
        # UTF-32LE
        sniffed_xml_encoding = 'utf-32le'
-        xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
+        xml_data = str(xml_data, 'utf-32le').encode('utf-8')
    elif xml_data[:4] == '\x00\x00\xfe\xff':
        # UTF-32BE with BOM
        sniffed_xml_encoding = 'utf-32be'
-        xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
+        xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8')
    elif xml_data[:4] == '\xff\xfe\x00\x00':
        # UTF-32LE with BOM
        sniffed_xml_encoding = 'utf-32le'
-        xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
+        xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8')
    elif xml_data[:3] == '\xef\xbb\xbf':
        # UTF-8 with BOM
        sniffed_xml_encoding = 'utf-8'
-        xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
+        xml_data = str(xml_data[3:], 'utf-8').encode('utf-8')
    else:
        # ASCII-compatible
        pass
@@ -3225,7 +3223,7 @@
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
-    elif http_headers and (not http_headers.has_key('content-type')):
+    elif http_headers and ('content-type' not in http_headers):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
@@ -3274,14 +3272,14 @@
            sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
-    newdata = unicode(data, encoding)
+    newdata = str(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
-        newdata = newdecl + u'\n' + newdata
+        newdata = newdecl + '\n' + newdata
    return newdata.encode('utf-8')

 def _stripDoctype(data):
@@ -3305,7 +3303,7 @@
    replacement=''
    if len(doctype_results)==1 and entity_results:
        safe_pattern=re.compile('\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
-        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
+        safe_entities=[e for e in entity_results if safe_pattern.match(e)]
        if safe_entities:
            replacement='<!DOCTYPE feed [\n<!ENTITY %s>\n]>' % '>\n<!ENTITY '.join(safe_entities)
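
Taken together, the hunks above are the mechanical Python 2 to 3 rewrites that 2to3-style tools emit (the "(original)/(refactored)" diff header is lib2to3's output format): unichr()/unicode() become chr()/str(), d.has_key(k) becomes k in d, <> becomes !=, raise E, msg becomes raise E(msg), except E, e becomes except E as e, apply(f, args) becomes f(*args), filter(lambda ...) becomes a list comprehension, and dict views get wrapped in list() where the old code expected a list. Below is a minimal sketch of the same before/after idioms, runnable under Python 3; the entity table and function names are invented for illustration and are not feedparser APIs:

    # Python 3 spellings of the idioms rewritten throughout the diff above.
    # All names here are hypothetical demo names, not feedparser code.
    entities = {'amp': 38, 'copy': 169}

    def decode_charref(ref):
        # Py2: unichr(int(ref[1:], 16))  ->  Py3: chr(...)
        if ref.startswith('x'):
            return chr(int(ref[1:], 16))
        return chr(int(ref))

    def expand(name):
        # Py2: entities.has_key(name)  ->  Py3: name in entities
        if name in entities:
            return chr(entities[name])
        # Py2: raise KeyError, "..."  ->  Py3: raise KeyError("...")
        raise KeyError("undefined entity '%s'" % name)

    refs = ['amp', 'copy', 'trade']
    # Py2: filter(lambda t: t not in entities, refs)  ->  Py3: list comprehension
    unknown = [t for t in refs if t not in entities]

    try:
        print(''.join(expand(r) for r in refs))         # raises on 'trade'
    except KeyError as e:                               # Py2: except KeyError, e
        print('unknown refs: %s' % unknown)

    print(decode_charref('xa9'), decode_charref('169'))  # two copyright signs

Two caveats. First, plain 2to3 does not rewrite type(x) == type(u'') into isinstance(x, type('')) by default; that pattern looks like the optional idioms fixer, which has to be requested explicitly (for example, 2to3 -f all -f idioms feedparser.py). Second, several of these hunks only translate syntax, not semantics: under Python 3, str(data, encoding) raises TypeError when data is already a str, and the bytes/str split invalidates much of the sniffing code near the end, so this diff is a starting point for a port rather than a working one.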