Lib/urllib/parse.py - Issue 2827: [issue3300] urllib.quote and unquote Code Review (original) (raw)
OLD
NEW
1 """Parse (absolute and relative) URLs.
1 """Parse (absolute and relative) URLs.
2
2
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4 UC Irvine, June 1995.
4 UC Irvine, June 1995.
5 """
5 """
6
6
7 import sys
7 import sys
8
8
9 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
9 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
10 "urlsplit", "urlunsplit"]
10 "urlsplit", "urlunsplit"]
11
11
12 # A classification of schemes ('' means apply by default)
12 # A classification of schemes ('' means apply by default)
13 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
13 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
14 'wais', 'file', 'https', 'shttp', 'mms',
14 'wais', 'file', 'https', 'shttp', 'mms',
15 'prospero', 'rtsp', 'rtspu', '', 'sftp']
15 'prospero', 'rtsp', 'rtspu', '', 'sftp']
16 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
16 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
17 'imap', 'wais', 'file', 'mms', 'https', 'shttp',
17 'imap', 'wais', 'file', 'mms', 'https', 'shttp',
18 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
18 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
19 'svn', 'svn+ssh', 'sftp']
19 'svn', 'svn+ssh', 'sftp']
20 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
20 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
253 the URL contained no fragments, the second element is the
253 the URL contained no fragments, the second element is the
254 empty string.
254 empty string.
255 """
255 """
256 if '#' in url:
256 if '#' in url:
257 s, n, p, a, q, frag = urlparse(url)
257 s, n, p, a, q, frag = urlparse(url)
258 defrag = urlunparse((s, n, p, a, q, ''))
258 defrag = urlunparse((s, n, p, a, q, ''))
259 return defrag, frag
259 return defrag, frag
260 else:
260 else:
261 return url, ''
261 return url, ''
262
262
263 # _hextochr maps 2-hex-digit strings onto single bytes
264 # eg. _hextochr['2f'] = b'\x2f'
265 # Maps lowercase and uppercase variants (but not mixed case).
266 _hextochr = dict(('%02x' % i, bytes([i])) for i in range(256))
267 _hextochr.update(('%02X' % i, bytes([i])) for i in range(256))
263
268
264 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
269 def unquote_to_bytes(s):
265 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
270 """unquote_to_bytes('abc%20def') -> b'abc def'."""
271 # Note: strings are encoded as UTF-8. This is only an issue if it contains
272 # unescaped non-ASCII characters, which URIs should not.
273 res = s.split('%')
274 res[0] = res[0].encode('utf-8')
275 for i in range(1, len(res)):
276 item = res[i]
277 try:
278 res[i] = _hextochr[item[:2]] + item[2:].encode('utf-8')
279 except KeyError:
280 res[i] = b'%' + item.encode('utf-8')
281 return b"".join(res)
266
282
267 def unquote(s):
283 def unquote(s, encoding = "utf-8", errors = "replace"):
268 """unquote('abc%20def') -> 'abc def'."""
284 """Replace %xx escapes by their single-character equivalent. The optional
285 encoding and errors parameters specify how to decode percent-encoded
286 sequences into Unicode characters, as accepted by the bytes.decode()
287 method.
288 By default, percent-encoded sequences are decoded with UTF-8, and invalid
289 sequences are replaced by a placeholder character.
290
291 unquote('abc%20def') -> 'abc def'.
292 """
293 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
294 # (list of single-byte bytes objects)
295 pct_sequence = []
269 res = s.split('%')
296 res = s.split('%')
270 for i in range(1, len(res)):
297 for i in range(1, len(res)):
271 item = res[i]
298 item = res[i]
272 try:
299 try:
273 res[i] = _hextochr[item[:2]] + item[2:]
300 pct_sequence.append(_hextochr[item[:2]])
301 rest = item[2:]
274 except KeyError:
302 except KeyError:
275 res[i] = '%' + item
303 rest = '%' + item
276 except UnicodeDecodeError:
304 if len(rest) == 0:
277 res[i] = chr(int(item[:2], 16)) + item[2:]
305 # This segment was just a single percent-encoded character.
306 # May be part of a sequence of code units, so delay decoding.
307 # (Stored in pct_sequence).
308 res[i] = ''
309 else:
310 # Encountered non-percent-encoded characters. Flush the current
311 # pct_sequence.
312 res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest
313 pct_sequence = []
314 if len(pct_sequence) > 0:
315 # Flush the final pct_sequence
316 # res[-1] will always be empty if pct_sequence != []
317 res[-1] = b''.join(pct_sequence).decode(encoding, errors)
278 return "".join(res)
318 return "".join(res)
279
319
280 def unquote_plus(s):
320 def unquote_plus(s, encoding = "utf-8", errors = "replace"):
281 """unquote('%7e/abc+def') -> '~/abc def'"""
321 """Like unquote(), but also replace plus signs by spaces, as required for
322 unquoting HTML form values.
323
324 unquote_plus('%7e/abc+def') -> '~/abc def'
325 """
282 s = s.replace('+', ' ')
326 s = s.replace('+', ' ')
283 return unquote(s)
327 return unquote(s, encoding, errors)
284
328
285 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
329 always_safe = frozenset(
286 'abcdefghijklmnopqrstuvwxyz'
330 b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
287 '0123456789' '_.-')
331 b'abcdefghijklmnopqrstuvwxyz'
332 b'0123456789' b'_.-')
288 _safe_quoters= {}
333 _safe_quoters= {}
289
334
290 class Quoter:
335 class Quoter:
291 def __init__(self, safe):
336 def __init__(self, safe):
337 """safe: May be either a string or bytes object."""
292 self.cache = {}
338 self.cache = {}
293 self.safe = safe + always_safe
339 # safe is a bytes object
340 self.safe = always_safe.union(c for c in safe if c < 128)
294
341
295 def __call__(self, c):
342 def __call__(self, c):
343 """
344 c: An int, representing a byte to be encoded. Must have range(0,256).
345 Returns a str.
346 """
296 try:
347 try:
297 return self.cache[c]
348 return self.cache[c]
298 except KeyError:
349 except KeyError:
299 if ord(c) < 256:
350 res = c in self.safe and chr(c) or ('%%%02X' % c)
300 res = (c in self.safe) and c or ('%%%02X' % ord(c))
351 self.cache[c] = res
301 self.cache[c] = res
352 return res
302 return res
303 else:
304 return "".join(['%%%02X' % i for i in c.encode("utf-8")])
305
353
306 def quote(s, safe = '/'):
354 def quote(s, safe = '/', encoding = "utf-8", errors = "replace"):
307 """quote('abc def') -> 'abc%20def'
355 """quote('abc def') -> 'abc%20def'
308
356
309 Each part of a URL, e.g. the path info, the query, etc., has a
357 Each part of a URL, e.g. the path info, the query, etc., has a
310 different set of reserved characters that must be quoted.
358 different set of reserved characters that must be quoted.
311
359
312 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
360 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
313 the following reserved characters.
361 the following reserved characters.
314
362
315 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
363 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
316 "$" | ","
364 "$" | ","
317
365
318 Each of these characters is reserved in some component of a URL,
366 Each of these characters is reserved in some component of a URL,
319 but not necessarily in all of them.
367 but not necessarily in all of them.
320
368
321 By default, the quote function is intended for quoting the path
369 By default, the quote function is intended for quoting the path
322 section of a URL. Thus, it will not encode '/'. This character
370 section of a URL. Thus, it will not encode '/'. This character
323 is reserved, but in typical usage the quote function is being
371 is reserved, but in typical usage the quote function is being
324 called on a path where the existing slash characters are used as
372 called on a path where the existing slash characters are used as
325 reserved characters.
373 reserved characters.
374
375 The optional encoding and errors parameters specify how to deal with
376 non-ASCII characters, as accepted by the str.encode method.
377 By default, characters are encoded with UTF-8, and unsupported characters
378 are replaced by a placeholder character.
326 """
379 """
380 if isinstance(safe, str):
381 # Normalize 'safe' by converting to bytes and removing non-ASCII chars
382 safe = safe.encode('ascii', 'ignore')
327 cachekey = (safe, always_safe)
383 cachekey = (safe, always_safe)
384 if isinstance(s, str):
385 s = s.encode(encoding, errors)
328 try:
386 try:
329 quoter = _safe_quoters[cachekey]
387 quoter = _safe_quoters[cachekey]
330 except KeyError:
388 except KeyError:
331 quoter = Quoter(safe)
389 quoter = Quoter(safe)
332 _safe_quoters[cachekey] = quoter
390 _safe_quoters[cachekey] = quoter
333 res = map(quoter, s)
391 res = map(quoter, s)
334 return ''.join(res)
392 return ''.join(res)
335
393
336 def quote_plus(s, safe = ''):
394 def quote_plus(s, safe = '', encoding = "utf-8", errors = "replace"):
337 """Quote the query fragment of a URL; replacing ' ' with '+'"""
395 """Like quote(), but also replace ' ' with '+', as required for quoting
396 HTML form values. Plus signs in the original string are escaped unless
397 they are included in safe. It also does not have safe default to '/'.
398 """
338 if ' ' in s:
399 if ' ' in s:
339 s = quote(s, safe + ' ')
400 s = quote(s, safe + ' ')
340 return s.replace(' ', '+')
401 return s.replace(' ', '+')
341 return quote(s, safe)
402 return quote(s, safe, encoding, errors)
403
404 # quote accepts either bytes or strings, so quote_from_bytes is just an alias
405 quote_from_bytes = quote
342
406
343 def urlencode(query,doseq=0):
407 def urlencode(query,doseq=0):
344 """Encode a sequence of two-element tuples or dictionary into a URL query string.
408 """Encode a sequence of two-element tuples or dictionary into a URL query string.
345
409
346 If any values in the query arg are sequences and doseq is true, each
410 If any values in the query arg are sequences and doseq is true, each
347 sequence element is converted to a separate parameter.
411 sequence element is converted to a separate parameter.
348
412
349 If the query arg is a sequence of two-element tuples, the order of the
413 If the query arg is a sequence of two-element tuples, the order of the
350 parameters in the output will match the order of parameters in the
414 parameters in the output will match the order of parameters in the
351 input.
415 input.
(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
622 if not base:
686 if not base:
623 base = abs
687 base = abs
624 wrapped = 'URL:%s' % abs
688 wrapped = 'URL:%s' % abs
625 print('%-10s = %s' % (url, wrapped))
689 print('%-10s = %s' % (url, wrapped))
626 if len(words) == 3 and words[1] == '=':
690 if len(words) == 3 and words[1] == '=':
627 if wrapped != words[2]:
691 if wrapped != words[2]:
628 print('EXPECTED', words[2], '!!!!!!!!!!')
692 print('EXPECTED', words[2], '!!!!!!!!!!')
629
693
630 if __name__ == '__main__':
694 if __name__ == '__main__':
631 test()
695 test()
OLD
NEW