parse.py - Issue 2827: [issue3300] urllib.quote and unquote Code Review (original) (raw)

OLD

NEW

1 """Parse (absolute and relative) URLs.

3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,

4 UC Irvine, June 1995.

5 """

7 import sys

9 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",

10 "urlsplit", "urlunsplit"]

12 # A classification of schemes ('' means apply by default)

13 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',

14 'wais', 'file', 'https', 'shttp', 'mms',

15 'prospero', 'rtsp', 'rtspu', '', 'sftp']

16 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',

17 'imap', 'wais', 'file', 'mms', 'https', 'shttp',

18 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',

19 'svn', 'svn+ssh', 'sftp']

20 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',

(...skipping 232 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...

253 the URL contained no fragments, the second element is the

254 empty string.

255 """

256 if '#' in url:

257 s, n, p, a, q, frag = urlparse(url)

258 defrag = urlunparse((s, n, p, a, q, ''))

259 return defrag, frag

260 else:

261 return url, ''

262

263 # _hextochr maps 2-hex-digit strings onto single bytes

264 # eg. _hextochr['2f'] = b'\x2f'

265 # Maps lowercase and uppercase variants (but not mixed case).

266 _hextochr = dict(('%02x' % i, bytes([i])) for i in range(256))

267 _hextochr.update(('%02X' % i, bytes([i])) for i in range(256))

263

268

264 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))

269 def unquote_to_bytes(s):

265 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))

270 """unquote_to_bytes('abc%20def') -> b'abc def'."""

271 # Note: strings are encoded as UTF-8. This is only an issue if it contains

272 # unescaped non-ASCII characters, which URIs should not.

273 res = s.split('%')

274 res[0] = res[0].encode('utf-8')

275 for i in range(1, len(res)):

276 item = res[i]

277 try:

278 res[i] = _hextochr[item[:2]] + item[2:].encode('utf-8')

279 except KeyError:

280 res[i] = b'%' + item.encode('utf-8')

281 return b"".join(res)

266

282

267 def unquote(s):

283 def unquote(s, encoding = "utf-8", errors = "replace"):

268 """unquote('abc%20def') -> 'abc def'."""

284 """Replace %xx escapes by their single-character equivalent. The optional

285 encoding and errors parameters specify how to decode percent-encoded

286 sequences into Unicode characters, as accepted by the bytes.decode()

287 method.

288 By default, percent-encoded sequences are decoded with UTF-8, and invalid

289 sequences are replaced by a placeholder character.

290

291 unquote('abc%20def') -> 'abc def'.

292 """

293 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded

294 # (list of single-byte bytes objects)

295 pct_sequence = []

269 res = s.split('%')

296 res = s.split('%')

270 for i in range(1, len(res)):

297 for i in range(1, len(res)):

271 item = res[i]

298 item = res[i]

272 try:

299 try:

273 res[i] = _hextochr[item[:2]] + item[2:]

300 pct_sequence.append(_hextochr[item[:2]])

301 rest = item[2:]

274 except KeyError:

302 except KeyError:

275 res[i] = '%' + item

303 rest = '%' + item

276 except UnicodeDecodeError:

304 if len(rest) == 0:

277 res[i] = chr(int(item[:2], 16)) + item[2:]

305 # This segment was just a single percent-encoded character.

306 # May be part of a sequence of code units, so delay decoding.

307 # (Stored in pct_sequence).

308 res[i] = ''

309 else:

310 # Encountered non-percent-encoded characters. Flush the current

311 # pct_sequence.

312 res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest

313 pct_sequence = []

314 if len(pct_sequence) > 0:

315 # Flush the final pct_sequence

316 # res[-1] will always be empty if pct_sequence != []

317 res[-1] = b''.join(pct_sequence).decode(encoding, errors)

278 return "".join(res)

318 return "".join(res)

279

319

280 def unquote_plus(s):

320 def unquote_plus(s, encoding = "utf-8", errors = "replace"):

281 """unquote('%7e/abc+def') -> '~/abc def'"""

321 """Like unquote(), but also replace plus signs by spaces, as required for

322 unquoting HTML form values.

323

324 unquote_plus('%7e/abc+def') -> '~/abc def'

325 """

282 s = s.replace('+', ' ')

326 s = s.replace('+', ' ')

283 return unquote(s)

327 return unquote(s, encoding, errors)

284

328

285 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'

329 always_safe = frozenset(

286 'abcdefghijklmnopqrstuvwxyz'

330 b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

287 '0123456789' '_.-')

331 b'abcdefghijklmnopqrstuvwxyz'

332 b'0123456789' b'_.-')

288 _safe_quoters= {}

333 _safe_quoters= {}

289

334

290 class Quoter:

335 class Quoter:

291 def __init__(self, safe):

336 def __init__(self, safe):

337 """safe: May be either a string or bytes object."""

292 self.cache = {}

338 self.cache = {}

293 self.safe = safe + always_safe

339 # safe is a bytes object

340 self.safe = always_safe.union(c for c in safe if c < 128)

294

341

295 def __call__(self, c):

342 def __call__(self, c):

343 """

344 c: An int, representing a byte to be encoded. Must have range(0,256).

345 Returns a str.

346 """

296 try:

347 try:

297 return self.cache[c]

348 return self.cache[c]

298 except KeyError:

349 except KeyError:

299 if ord(c) < 256:

350 res = c in self.safe and chr(c) or ('%%%02X' % c)

300 res = (c in self.safe) and c or ('%%%02X' % ord(c))

351 self.cache[c] = res

301 self.cache[c] = res

352 return res

302 return res

303 else:

304 return "".join(['%%%02X' % i for i in c.encode("utf-8")])

305

353

306 def quote(s, safe = '/'):

354 def quote(s, safe = '/', encoding = "utf-8", errors = "replace"):

307 """quote('abc def') -> 'abc%20def'

355 """quote('abc def') -> 'abc%20def'

308

356

309 Each part of a URL, e.g. the path info, the query, etc., has a

357 Each part of a URL, e.g. the path info, the query, etc., has a

310 different set of reserved characters that must be quoted.

358 different set of reserved characters that must be quoted.

311

359

312 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists

360 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists

313 the following reserved characters.

361 the following reserved characters.

314

362

315 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |

363 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |

316 "$" | ","

364 "$" | ","

317

365

318 Each of these characters is reserved in some component of a URL,

366 Each of these characters is reserved in some component of a URL,

319 but not necessarily in all of them.

367 but not necessarily in all of them.

320

368

321 By default, the quote function is intended for quoting the path

369 By default, the quote function is intended for quoting the path

322 section of a URL. Thus, it will not encode '/'. This character

370 section of a URL. Thus, it will not encode '/'. This character

323 is reserved, but in typical usage the quote function is being

371 is reserved, but in typical usage the quote function is being

324 called on a path where the existing slash characters are used as

372 called on a path where the existing slash characters are used as

325 reserved characters.

373 reserved characters.

374

375 The optional encoding and errors parameters specify how to deal with

376 non-ASCII characters, as accepted by the str.encode method.

377 By default, characters are encoded with UTF-8, and unsupported characters

378 are replaced by a placeholder character.

326 """

379 """

380 if isinstance(safe, str):

381 # Normalize 'safe' by converting to bytes and removing non-ASCII chars

382 safe = safe.encode('ascii', 'ignore')

327 cachekey = (safe, always_safe)

383 cachekey = (safe, always_safe)

384 if isinstance(s, str):

385 s = s.encode(encoding, errors)

328 try:

386 try:

329 quoter = _safe_quoters[cachekey]

387 quoter = _safe_quoters[cachekey]

330 except KeyError:

388 except KeyError:

331 quoter = Quoter(safe)

389 quoter = Quoter(safe)

332 _safe_quoters[cachekey] = quoter

390 _safe_quoters[cachekey] = quoter

333 res = map(quoter, s)

391 res = map(quoter, s)

334 return ''.join(res)

392 return ''.join(res)

335

393

336 def quote_plus(s, safe = ''):

394 def quote_plus(s, safe = '', encoding = "utf-8", errors = "replace"):

337 """Quote the query fragment of a URL; replacing ' ' with '+'"""

395 """Like quote(), but also replace ' ' with '+', as required for quoting

396 HTML form values. Plus signs in the original string are escaped unless

397 they are included in safe. It also does not have safe default to '/'.

398 """

338 if ' ' in s:

399 if ' ' in s:

339 s = quote(s, safe + ' ')

400 s = quote(s, safe + ' ')

340 return s.replace(' ', '+')

401 return s.replace(' ', '+')

341 return quote(s, safe)

402 return quote(s, safe, encoding, errors)

403

404 # quote accepts either bytes or strings, so quote_from_bytes is just an alias

405 quote_from_bytes = quote

342

406

343 def urlencode(query,doseq=0):

407 def urlencode(query,doseq=0):

344 """Encode a sequence of two-element tuples or dictionary into a URL query string.

408 """Encode a sequence of two-element tuples or dictionary into a URL query string.

345

409

346 If any values in the query arg are sequences and doseq is true, each

410 If any values in the query arg are sequences and doseq is true, each

347 sequence element is converted to a separate parameter.

411 sequence element is converted to a separate parameter.

348

412

349 If the query arg is a sequence of two-element tuples, the order of the

413 If the query arg is a sequence of two-element tuples, the order of the

350 parameters in the output will match the order of parameters in the

414 parameters in the output will match the order of parameters in the

351 input.

415 input.

(...skipping 270 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...

622 if not base:

686 if not base:

623 base = abs

687 base = abs

624 wrapped = 'URL:%s' % abs

688 wrapped = 'URL:%s' % abs

625 print('%-10s = %s' % (url, wrapped))

689 print('%-10s = %s' % (url, wrapped))

626 if len(words) == 3 and words[1] == '=':

690 if len(words) == 3 and words[1] == '=':

627 if wrapped != words[2]:

691 if wrapped != words[2]:

628 print('EXPECTED', words[2], '!!!!!!!!!!')

692 print('EXPECTED', words[2], '!!!!!!!!!!')

629

693

630 if __name__ == '__main__':

694 if __name__ == '__main__':

631 test()

695 test()

OLD

NEW