gh-102153: Start stripping C0 control and space chars in urlsplit (… · DataDog/cpython@2f630e1 (original) (raw)

`@@ -654,14 +654,73 @@ def test_urlsplit_remove_unsafe_bytes(self):

`

654

654

`self.assertEqual(p.scheme, "http")

`

655

655

`self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")

`

656

656

``

``

657

`+

def test_urlsplit_strip_url(self):

`

``

658

`+

noise = bytes(range(0, 0x20 + 1))

`

``

659

`+

base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"

`

``

660

+

``

661

`+

url = noise.decode("utf-8") + base_url

`

``

662

`+

p = urllib.parse.urlsplit(url)

`

``

663

`+

self.assertEqual(p.scheme, "http")

`

``

664

`+

self.assertEqual(p.netloc, "User:Pass@www.python.org:080")

`

``

665

`+

self.assertEqual(p.path, "/doc/")

`

``

666

`+

self.assertEqual(p.query, "query=yes")

`

``

667

`+

self.assertEqual(p.fragment, "frag")

`

``

668

`+

self.assertEqual(p.username, "User")

`

``

669

`+

self.assertEqual(p.password, "Pass")

`

``

670

`+

self.assertEqual(p.hostname, "www.python.org")

`

``

671

`+

self.assertEqual(p.port, 80)

`

``

672

`+

self.assertEqual(p.geturl(), base_url)

`

``

673

+

``

674

`+

url = noise + base_url.encode("utf-8")

`

``

675

`+

p = urllib.parse.urlsplit(url)

`

``

676

`+

self.assertEqual(p.scheme, b"http")

`

``

677

`+

self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")

`

``

678

`+

self.assertEqual(p.path, b"/doc/")

`

``

679

`+

self.assertEqual(p.query, b"query=yes")

`

``

680

`+

self.assertEqual(p.fragment, b"frag")

`

``

681

`+

self.assertEqual(p.username, b"User")

`

``

682

`+

self.assertEqual(p.password, b"Pass")

`

``

683

`+

self.assertEqual(p.hostname, b"www.python.org")

`

``

684

`+

self.assertEqual(p.port, 80)

`

``

685

`+

self.assertEqual(p.geturl(), base_url.encode("utf-8"))

`

``

686

+

``

687

`+

# Test that trailing space is preserved as some applications rely on

`

``

688

`+

# this within query strings.

`

``

689

`+

query_spaces_url = "https://www.python.org:88/doc/?query= "

`

``

690

`+

p = urllib.parse.urlsplit(noise.decode("utf-8") + query_spaces_url)

`

``

691

`+

self.assertEqual(p.scheme, "https")

`

``

692

`+

self.assertEqual(p.netloc, "www.python.org:88")

`

``

693

`+

self.assertEqual(p.path, "/doc/")

`

``

694

`+

self.assertEqual(p.query, "query= ")

`

``

695

`+

self.assertEqual(p.port, 88)

`

``

696

`+

self.assertEqual(p.geturl(), query_spaces_url)

`

``

697

+

``

698

`+

p = urllib.parse.urlsplit("www.pypi.org ")

`

``

699

`+

# That "hostname" gets considered a "path" due to the

`

``

700

`+

# trailing space and our existing logic... YUCK...

`

``

701

`+

# and re-assembles via geturl aka urlunsplit into the original.

`

``

702

`+

# django.core.validators.URLValidator (at least through v3.2) relies on

`

``

703

`+

# this, for better or worse, to catch it in a ValidationError via its

`

``

704

`+

# regular expressions.

`

``

705

`+

# Here we test the basic round trip concept of such a trailing space.

`

``

706

`+

self.assertEqual(urllib.parse.urlunsplit(p), "www.pypi.org ")

`

``

707

+

``

708

`+

# with scheme as cache-key

`

``

709

`+

url = "//www.python.org/"

`

``

710

`+

scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8")

`

``

711

`+

for _ in range(2):

`

``

712

`+

p = urllib.parse.urlsplit(url, scheme=scheme)

`

``

713

`+

self.assertEqual(p.scheme, "https")

`

``

714

`+

self.assertEqual(p.geturl(), "https://www.python.org/")

`

``

715

+

657

716

`def test_attributes_bad_port(self):

`

658

717

`"""Check handling of invalid ports."""

`

659

718

`for bytes in (False, True):

`

660

719

`for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):

`

661

720

`for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"):

`

662

721

`with self.subTest(bytes=bytes, parse=parse, port=port):

`

663

722

`netloc = "www.example.net:" + port

`

664

``

`-

url = "http://" + netloc

`

``

723

`+

url = "http://" + netloc + "/"

`

665

724

`if bytes:

`

666

725

`if netloc.isascii() and port.isascii():

`

667

726

`netloc = netloc.encode("ascii")

`