(original) (raw)

changeset:   82660:209a9c2de9bd
branch:      3.3
parent:      82655:e45db319e590
parent:      82659:3cb07925fcb9
user:        Serhiy Storchaka <storchaka@gmail.com>
date:        Thu Mar 14 21:33:35 2013 +0200
files:       Misc/NEWS
description:
Issue #1285086: Get rid of the refcounting hack and speed up
urllib.parse.unquote() and urllib.parse.unquote_to_bytes().


diff -r e45db319e590 -r 209a9c2de9bd Lib/urllib/parse.py
--- a/Lib/urllib/parse.py	Wed Mar 13 21:35:07 2013 -0400
+++ b/Lib/urllib/parse.py	Thu Mar 14 21:33:35 2013 +0200
@@ -27,6 +27,7 @@
 test_urlparse.py provides a good indicator of parsing behavior.
 """
 
+import re
 import sys
 import collections
 
@@ -470,6 +471,10 @@
         defrag = url
     return _coerce_result(DefragResult(defrag, frag))
 
+_hexdig = '0123456789ABCDEFabcdef'
+_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])
+              for a in _hexdig for b in _hexdig}
+
 def unquote_to_bytes(string):
     """unquote_to_bytes('abc%20def') -> b'abc def'."""
     # Note: strings are encoded as UTF-8. This is only an issue if it contains
@@ -480,16 +485,21 @@
         return b''
     if isinstance(string, str):
         string = string.encode('utf-8')
-    res = string.split(b'%')
-    if len(res) == 1:
+    bits = string.split(b'%')
+    if len(bits) == 1:
         return string
-    string = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            string += bytes([int(item[:2], 16)]) + item[2:]
-        except ValueError:
-            string += b'%' + item
-    return string
+            append(_hextobyte[item[:2]])
+            append(item[2:])
+        except KeyError:
+            append(b'%')
+            append(item)
+    return b''.join(res)
+
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(string, encoding='utf-8', errors='replace'):
     """Replace %xx escapes by their single-character equivalent. The optional
@@ -501,39 +511,20 @@
 
     unquote('abc%20def') -> 'abc def'.
     """
-    if string == '':
-        return string
-    res = string.split('%')
-    if len(res) == 1:
+    if '%' not in string:
+        string.split
         return string
     if encoding is None:
         encoding = 'utf-8'
     if errors is None:
         errors = 'replace'
-    # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
-    pct_sequence = b''
-    string = res[0]
-    for item in res[1:]:
-        try:
-            if not item:
-                raise ValueError
-            pct_sequence += bytes.fromhex(item[:2])
-            rest = item[2:]
-            if not rest:
-                # This segment was just a single percent-encoded character.
-                # May be part of a sequence of code units, so delay decoding.
-                # (Stored in pct_sequence).
-                continue
-        except ValueError:
-            rest = '%' + item
-        # Encountered non-percent-encoded characters. Flush the current
-        # pct_sequence.
-        string += pct_sequence.decode(encoding, errors) + rest
-        pct_sequence = b''
-    if pct_sequence:
-        # Flush the final pct_sequence
-        string += pct_sequence.decode(encoding, errors)
-    return string
+    bits = _asciire.split(string)
+    res = [bits[0]]
+    append = res.append
+    for i in range(1, len(bits), 2):
+        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
+        append(bits[i + 1])
+    return ''.join(res)
 
 def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
diff -r e45db319e590 -r 209a9c2de9bd Misc/NEWS
--- a/Misc/NEWS	Wed Mar 13 21:35:07 2013 -0400
+++ b/Misc/NEWS	Thu Mar 14 21:33:35 2013 +0200
@@ -193,6 +193,9 @@
 Library
 -------
 
+- Issue #1285086: Get rid of the refcounting hack and speed up
+  urllib.parse.unquote() and urllib.parse.unquote_to_bytes().
+
 - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
   a failure while decoding empty object literals when object_pairs_hook was
   specified.