cpython: 209a9c2de9bd (original) (raw)
Mercurial > cpython
changeset 82660:209a9c2de9bd 3.3
Issue #1285086: Get rid of the refcounting hack and speed up urllib.parse.unquote() and urllib.parse.unquote_to_bytes(). [#1285086]
Serhiy Storchaka storchaka@gmail.com | |
---|---|
date | Thu, 14 Mar 2013 21:33:35 +0200 |
parents | e45db319e590 (current diff), 3cb07925fcb9 (diff) |
children | 9367411a261e 937989570b42 |
files | Lib/urllib/parse.py Misc/NEWS |
diffstat | 2 files changed, 30 insertions(+), 36 deletions(-)[+] [-] Lib/urllib/parse.py 63 Misc/NEWS 3 |
line wrap: on
line diff
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -27,6 +27,7 @@ parsing quirks from older RFCs are retai
 test_urlparse.py provides a good indicator of parsing behavior.
 """
 
+import re
 import sys
 import collections
 
@@ -470,6 +471,10 @@ def urldefrag(url):
         defrag = url
     return _coerce_result(DefragResult(defrag, frag))
 
+_hexdig = '0123456789ABCDEFabcdef'
+_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
+              for a in _hexdig for b in _hexdig}
+
 def unquote_to_bytes(string):
     """unquote_to_bytes('abc%20def') -> b'abc def'."""
     # Note: strings are encoded as UTF-8. This is only an issue if it contains
@@ -480,16 +485,21 @@ def unquote_to_bytes(string):
         return b''
     if isinstance(string, str):
         string = string.encode('utf-8')
-            string += bytes([int(item[:2], 16)]) + item[2:]
-        except ValueError:
-            string += b'%' + item
-    return string
+            append(_hextobyte[item[:2]])
+            append(item[2:])
+        except KeyError:
+            append(b'%')
+            append(item)
+    return b''.join(res)
+
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(string, encoding='utf-8', errors='replace'):
     """Replace %xx escapes by their single-character equivalent. The optional
@@ -501,39 +511,20 @@ def unquote(string, encoding='utf-8', er
     unquote('abc%20def') -> 'abc def'.
     """
+    if '%' not in string:
+        string.split
         return string
     if encoding is None:
         encoding = 'utf-8'
     if errors is None:
         errors = 'replace'
-    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
-    pct_sequence = b''
-    string = res[0]
-    for item in res[1:]:
-        try:
-            if not item:
-                raise ValueError
-            pct_sequence += bytes.fromhex(item[:2])
-            rest = item[2:]
-            if not rest:
-                # This segment was just a single percent-encoded character.
-                # May be part of a sequence of code units, so delay decoding.
-                # (Stored in pct_sequence).
-                continue
-        except ValueError:
-            rest = '%' + item
-        # Encountered non-percent-encoded characters. Flush the current
-        # pct_sequence.
-        string += pct_sequence.decode(encoding, errors) + rest
-        pct_sequence = b''
-    if pct_sequence:
-        # Flush the final pct_sequence
-        string += pct_sequence.decode(encoding, errors)
-    return string
+    bits = _asciire.split(string)
+    res = [bits[0]]
+    append = res.append
+    for i in range(1, len(bits), 2):
+        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
+        append(bits[i + 1])
+    return ''.join(res)
def parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace'):
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -193,6 +193,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #1285086: Get rid of the refcounting hack and speed up