cpython: 3cb07925fcb9 (original) (raw)

Mercurial > cpython

changeset 82659:3cb07925fcb9 3.2

Issue #1285086: Get rid of the refcounting hack and speed up urllib.parse.unquote() and urllib.parse.unquote_to_bytes(). [#1285086]

Serhiy Storchaka storchaka@gmail.com
date	Thu, 14 Mar 2013 21:31:37 +0200
parents	9b45873e5a68
children	209a9c2de9bd 4b28a6a3eda6
files	Lib/urllib/parse.py Misc/NEWS
diffstat	2 files changed, 30 insertions(+), 36 deletions(-): Lib/urllib/parse.py (63 lines changed), Misc/NEWS (3 lines changed)

line wrap: on

line diff

--- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -27,6 +27,7 @@ parsing quirks from older RFCs are retai test_urlparse.py provides a good indicator of parsing behavior. """ +import re import sys import collections @@ -470,6 +471,10 @@ def urldefrag(url): defrag = url return _coerce_result(DefragResult(defrag, frag)) +_hexdig = '0123456789ABCDEFabcdef' +_hextobyte = {(a + b).encode(): bytes([int(a + b, 16)])

         for a in _hexdig for b in _hexdig}

+ def unquote_to_bytes(string): """unquote_to_bytes('abc%20def') -> b'abc def'.""" # Note: strings are encoded as UTF-8. This is only an issue if it contains @@ -480,16 +485,21 @@ def unquote_to_bytes(string): return b'' if isinstance(string, str): string = string.encode('utf-8')

res = string.split(b'%')
if len(res) == 1:

bits = string.split(b'%')
if len(bits) == 1: return string

string = res[0]
for item in res[1:]:

res = [bits[0]]
append = res.append
for item in bits[1:]: try:

       string += bytes([int(item[:2], 16)]) + item[2:][](#l1.37)

```
   except ValueError:[](#l1.38)
```
```
       string += b'%' + item[](#l1.39)
```
return string

       append(_hextobyte[item[:2]])[](#l1.41)

```
       append(item[2:])[](#l1.42)
```
```
   except KeyError:[](#l1.43)
```
```
       append(b'%')[](#l1.44)
```
```
       append(item)[](#l1.45)
```
return b''.join(res)

+ +_asciire = re.compile('([\x00-\x7f]+)') def unquote(string, encoding='utf-8', errors='replace'): """Replace %xx escapes by their single-character equivalent. The optional @@ -501,39 +511,20 @@ def unquote(string, encoding='utf-8', er unquote('abc%20def') -> 'abc def'. """

if string == '':
```
   return string[](#l1.57)
```
res = string.split('%')
if len(res) == 1:

if '%' not in string:
```
   string.split[](#l1.61)
   return string[](#l1.62)
```
if encoding is None: encoding = 'utf-8' if errors is None: errors = 'replace'

# pct_sequence: contiguous sequence of percent-encoded bytes, decoded
pct_sequence = b''
string = res[0]
for item in res[1:]:
```
   try:[](#l1.71)
```
```
       if not item:[](#l1.72)
```
```
           raise ValueError[](#l1.73)
```

       pct_sequence += bytes.fromhex(item[:2])[](#l1.74)

```
       rest = item[2:][](#l1.75)
```
```
       if not rest:[](#l1.76)
```

           # This segment was just a single percent-encoded character.[](#l1.77)

           # May be part of a sequence of code units, so delay decoding.[](#l1.78)

           # (Stored in pct_sequence).[](#l1.79)

```
           continue[](#l1.80)
```
```
   except ValueError:[](#l1.81)
```
```
       rest = '%' + item[](#l1.82)
```

   # Encountered non-percent-encoded characters. Flush the current[](#l1.83)

```
   # pct_sequence.[](#l1.84)
```

   string += pct_sequence.decode(encoding, errors) + rest[](#l1.85)

```
   pct_sequence = b''[](#l1.86)
```
if pct_sequence:

   # Flush the final pct_sequence[](#l1.88)

   string += pct_sequence.decode(encoding, errors)[](#l1.89)

return string

bits = _asciire.split(string)
res = [bits[0]]
append = res.append
for i in range(1, len(bits), 2):

   append(unquote_to_bytes(bits[i]).decode(encoding, errors))[](#l1.95)

```
   append(bits[i + 1])[](#l1.96)
```
return ''.join(res)

def parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace'):

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -233,6 +233,9 @@ Core and Builtins Library ------- +- Issue #1285086: Get rid of the refcounting hack and speed up urllib.parse.unquote() and urllib.parse.unquote_to_bytes(). +

Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused a failure while decoding empty object literals when object_pairs_hook was specified.

cpython: 3cb07925fcb9 (original) (raw)

Mercurial > cpython

changeset 82659:3cb07925fcb9 3.2

# pct_sequence: contiguous sequence of percent-encoded bytes, decoded