[Python-checkins] r81271 - in python/branches/py3k: Lib/urllib/parse.py Misc/NEWS (original) (raw)
florent.xicluna python-checkins at python.org
Mon May 17 19:33:07 CEST 2010
- Previous message: [Python-checkins] r81270 - in python/branches/py3k: Lib/urllib/request.py
- Next message: [Python-checkins] r81272 - python/branches/py3k/Lib/urllib/parse.py
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Author: florent.xicluna Date: Mon May 17 19:33:07 2010 New Revision: 81271
Log: Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes, unquote, unquote_to_bytes.
Recorded merge of revisions 81265 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk
........ r81265 | florent.xicluna | 2010-05-17 15:35:09 +0200 (lun, 17 mai 2010) | 2 lines
Issue #1285086: Speed up urllib.quote and urllib.unquote for simple cases. ........
Modified: python/branches/py3k/ (props changed) python/branches/py3k/Lib/urllib/parse.py python/branches/py3k/Misc/NEWS
Modified: python/branches/py3k/Lib/urllib/parse.py
--- python/branches/py3k/Lib/urllib/parse.py (original) +++ python/branches/py3k/Lib/urllib/parse.py Mon May 17 19:33:07 2010 @@ -41,7 +41,7 @@ uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file', 'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
'svn', 'svn+ssh', 'sftp', 'nfs',' git', 'git+ssh']
'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', @@ -307,17 +307,20 @@ """unquote_to_bytes('abc%20def') -> b'abc def'.""" # Note: strings are encoded as UTF-8. This is only an issue if it contains # unescaped non-ASCII characters, which URIs should not.
- if not string:
return b''
if isinstance(string, str): string = string.encode('utf-8') res = string.split(b'%')
- res[0] = res[0]
- for i in range(1, len(res)):
item = res[i]
- if len(res) == 1:
return string
- string = res[0]
- for item in res[1:]: try:
res[i] = bytes([int(item[:2], 16)]) + item[2:]
string += bytes([int(item[:2], 16)]) + item[2:] except ValueError:
res[i] = b'%' + item
- return b''.join(res)
string += b'%' + item
- return string
def unquote(string, encoding='utf-8', errors='replace'): """Replace %xx escapes by their single-character equivalent. The optional @@ -329,36 +332,39 @@
unquote('abc%20def') -> 'abc def'.
"""
- if encoding is None: encoding = 'utf-8'
- if errors is None: errors = 'replace'
pct_sequence: contiguous sequence of percent-encoded bytes, decoded
(list of single-byte bytes objects)
- pct_sequence = []
- if not string:
return string
res = string.split('%')
- for i in range(1, len(res)):
item = res[i]
- if len(res) == 1:
return string
- if encoding is None:
encoding = 'utf-8'
- if errors is None:
errors = 'replace'
pct_sequence: contiguous sequence of percent-encoded bytes
- pct_sequence = b''
- string = res[0]
- for item in res[1:]: try:
if not item: raise ValueError
pct_sequence.append(bytes.fromhex(item[:2]))
if not item:
raise ValueError
pct_sequence += bytes.fromhex(item[:2]) rest = item[2:]
if not rest:
# This segment was just a single percent-encoded character.
# May be part of a sequence of code units, so delay decoding.
# (Stored in pct_sequence).
continue except ValueError: rest = '%' + item
if not rest:
# This segment was just a single percent-encoded character.
# May be part of a sequence of code units, so delay decoding.
# (Stored in pct_sequence).
res[i] = ''
else:
# Encountered non-percent-encoded characters. Flush the current
# pct_sequence.
res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
pct_sequence = []
# Encountered non-percent-encoded characters. Flush the current
# pct_sequence.
string += pct_sequence.decode(encoding, errors) + rest
pct_sequence = b''
if pct_sequence: # Flush the final pct_sequence
# res[-1] will always be empty if pct_sequence != []
assert not res[-1], "string=%r, res=%r" % (string, res)
res[-1] = b''.join(pct_sequence).decode(encoding, errors)
- return ''.join(res)
string += pct_sequence.decode(encoding, errors)
- return string
def parse_qs(qs, keep_blank_values=False, strict_parsing=False): """Parse a query given as a string argument. @@ -439,7 +445,8 @@ b'abcdefghijklmnopqrstuvwxyz' b'0123456789' b'_.-') -_safe_quoters= {} +_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) +_safe_quoters = {}
class Quoter(collections.defaultdict): """A mapping from bytes (in range(0,256)) to strings. @@ -451,7 +458,7 @@ # of cached keys don't call Python code at all). def __init__(self, safe): """safe: bytes object."""
self.safe = _ALWAYS_SAFE.union(c for c in safe if c < 128)
self.safe = _ALWAYS_SAFE.union(safe)
def __repr__(self): # Without this, will just display as a defaultdict
@@ -459,7 +466,7 @@
def __missing__(self, b):
# Handle a cache miss. Store quoted string in cache and return.
res = b in self.safe and chr(b) or ('%%%02X' % b)
res = chr(b) if b in self.safe else '%{:02X}'.format(b) self[b] = res return res
@@ -493,6 +500,8 @@ errors='strict' (unsupported characters raise a UnicodeEncodeError). """ if isinstance(string, str):
if not string:
return string if encoding is None: encoding = 'utf-8' if errors is None:
@@ -527,18 +536,22 @@ not perform string-to-bytes encoding. It always returns an ASCII string. quote_from_bytes(b'abc def\xab') -> 'abc%20def%AB' """
- if not isinstance(bs, (bytes, bytearray)):
raise TypeError("quote_from_bytes() expected bytes")
- if not bs:
return ''
if isinstance(safe, str): # Normalize 'safe' by converting to bytes and removing non-ASCII chars safe = safe.encode('ascii', 'ignore')
- cachekey = bytes(safe) # In case it was a bytearray
- if not (isinstance(bs, bytes) or isinstance(bs, bytearray)):
raise TypeError("quote_from_bytes() expected a bytes")
- else:
safe = bytes([c for c in safe if c < 128])
- if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
return bs.decode()
try:
quoter = _safe_quoters[cachekey]
quoter = _safe_quoters[safe]
except KeyError:
quoter = Quoter(safe)
_safe_quoters[cachekey] = quoter
- return ''.join([quoter[char] for char in bs])
_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
- return ''.join([quoter(char) for char in bs])
def urlencode(query, doseq=False): """Encode a sequence of two-element tuples or dictionary into a URL query string.
Modified: python/branches/py3k/Misc/NEWS
--- python/branches/py3k/Misc/NEWS (original) +++ python/branches/py3k/Misc/NEWS Mon May 17 19:33:07 2010 @@ -366,6 +366,9 @@ Library
+- Issue #1285086: Speed up urllib.parse functions: quote, quote_from_bytes,
- unquote, unquote_to_bytes.
Issue #8688: Distutils now recalculates MANIFEST everytime.
Issue #8477: ssl.RAND_egd() and ssl._test_decode_cert() support str with
- Previous message: [Python-checkins] r81270 - in python/branches/py3k: Lib/urllib/request.py
- Next message: [Python-checkins] r81272 - python/branches/py3k/Lib/urllib/parse.py
- Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]