[3.11] gh-102153: Start stripping C0 control and space chars in `urls… · DataDog/cpython@610cc0a (original) (raw)
`@@ -649,14 +649,73 @@ def test_urlsplit_remove_unsafe_bytes(self):
`
649
649
`self.assertEqual(p.scheme, "http")
`
650
650
`self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
`
651
651
``
``
652
`+
def test_urlsplit_strip_url(self):
`
``
653
`+
noise = bytes(range(0, 0x20 + 1))
`
``
654
`+
base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"
`
``
655
+
``
656
`+
url = noise.decode("utf-8") + base_url
`
``
657
`+
p = urllib.parse.urlsplit(url)
`
``
658
`+
self.assertEqual(p.scheme, "http")
`
``
659
`+
self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
`
``
660
`+
self.assertEqual(p.path, "/doc/")
`
``
661
`+
self.assertEqual(p.query, "query=yes")
`
``
662
`+
self.assertEqual(p.fragment, "frag")
`
``
663
`+
self.assertEqual(p.username, "User")
`
``
664
`+
self.assertEqual(p.password, "Pass")
`
``
665
`+
self.assertEqual(p.hostname, "www.python.org")
`
``
666
`+
self.assertEqual(p.port, 80)
`
``
667
`+
self.assertEqual(p.geturl(), base_url)
`
``
668
+
``
669
`+
url = noise + base_url.encode("utf-8")
`
``
670
`+
p = urllib.parse.urlsplit(url)
`
``
671
`+
self.assertEqual(p.scheme, b"http")
`
``
672
`+
self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
`
``
673
`+
self.assertEqual(p.path, b"/doc/")
`
``
674
`+
self.assertEqual(p.query, b"query=yes")
`
``
675
`+
self.assertEqual(p.fragment, b"frag")
`
``
676
`+
self.assertEqual(p.username, b"User")
`
``
677
`+
self.assertEqual(p.password, b"Pass")
`
``
678
`+
self.assertEqual(p.hostname, b"www.python.org")
`
``
679
`+
self.assertEqual(p.port, 80)
`
``
680
`+
self.assertEqual(p.geturl(), base_url.encode("utf-8"))
`
``
681
+
``
682
`+
# Test that trailing space is preserved as some applications rely on
`
``
683
`+
# this within query strings.
`
``
684
`+
query_spaces_url = "https://www.python.org:88/doc/?query= "
`
``
685
`+
p = urllib.parse.urlsplit(noise.decode("utf-8") + query_spaces_url)
`
``
686
`+
self.assertEqual(p.scheme, "https")
`
``
687
`+
self.assertEqual(p.netloc, "www.python.org:88")
`
``
688
`+
self.assertEqual(p.path, "/doc/")
`
``
689
`+
self.assertEqual(p.query, "query= ")
`
``
690
`+
self.assertEqual(p.port, 88)
`
``
691
`+
self.assertEqual(p.geturl(), query_spaces_url)
`
``
692
+
``
693
`+
p = urllib.parse.urlsplit("www.pypi.org ")
`
``
694
`+
# That "hostname" gets considered a "path" due to the
`
``
695
`+
# trailing space and our existing logic... YUCK...
`
``
696
`+
# and re-assembles via geturl aka urlunsplit into the original.
`
``
697
`+
# django.core.validators.URLValidator (at least through v3.2) relies on
`
``
698
`+
# this, for better or worse, to catch it in a ValidationError via its
`
``
699
`+
# regular expressions.
`
``
700
`+
# Here we test the basic round trip concept of such a trailing space.
`
``
701
`+
self.assertEqual(urllib.parse.urlunsplit(p), "www.pypi.org ")
`
``
702
+
``
703
`+
# with scheme as cache-key
`
``
704
`+
url = "//www.python.org/"
`
``
705
`+
scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8")
`
``
706
`+
for _ in range(2):
`
``
707
`+
p = urllib.parse.urlsplit(url, scheme=scheme)
`
``
708
`+
self.assertEqual(p.scheme, "https")
`
``
709
`+
self.assertEqual(p.geturl(), "https://www.python.org/")
`
``
710
+
652
711
`def test_attributes_bad_port(self):
`
653
712
`"""Check handling of invalid ports."""
`
654
713
`for bytes in (False, True):
`
655
714
`for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):
`
656
715
`for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"):
`
657
716
`with self.subTest(bytes=bytes, parse=parse, port=port):
`
658
717
`netloc = "www.example.net:" + port
`
659
``
`-
url = "http://" + netloc
`
``
718
`+
url = "http://" + netloc + "/"
`
660
719
`if bytes:
`
661
720
`if netloc.isascii() and port.isascii():
`
662
721
`netloc = netloc.encode("ascii")
`