gh-102153: Start stripping C0 control and space chars in urlsplit (… · DataDog/cpython@2f630e1 (original) (raw)
`@@ -654,14 +654,73 @@ def test_urlsplit_remove_unsafe_bytes(self):
`
654
654
`self.assertEqual(p.scheme, "http")
`
655
655
`self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/?query=something#fragment")
`
656
656
``
``
657
`+
def test_urlsplit_strip_url(self):
`
``
658
`+
noise = bytes(range(0, 0x20 + 1))
`
``
659
`+
base_url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag"
`
``
660
+
``
661
`+
url = noise.decode("utf-8") + base_url
`
``
662
`+
p = urllib.parse.urlsplit(url)
`
``
663
`+
self.assertEqual(p.scheme, "http")
`
``
664
`+
self.assertEqual(p.netloc, "User:Pass@www.python.org:080")
`
``
665
`+
self.assertEqual(p.path, "/doc/")
`
``
666
`+
self.assertEqual(p.query, "query=yes")
`
``
667
`+
self.assertEqual(p.fragment, "frag")
`
``
668
`+
self.assertEqual(p.username, "User")
`
``
669
`+
self.assertEqual(p.password, "Pass")
`
``
670
`+
self.assertEqual(p.hostname, "www.python.org")
`
``
671
`+
self.assertEqual(p.port, 80)
`
``
672
`+
self.assertEqual(p.geturl(), base_url)
`
``
673
+
``
674
`+
url = noise + base_url.encode("utf-8")
`
``
675
`+
p = urllib.parse.urlsplit(url)
`
``
676
`+
self.assertEqual(p.scheme, b"http")
`
``
677
`+
self.assertEqual(p.netloc, b"User:Pass@www.python.org:080")
`
``
678
`+
self.assertEqual(p.path, b"/doc/")
`
``
679
`+
self.assertEqual(p.query, b"query=yes")
`
``
680
`+
self.assertEqual(p.fragment, b"frag")
`
``
681
`+
self.assertEqual(p.username, b"User")
`
``
682
`+
self.assertEqual(p.password, b"Pass")
`
``
683
`+
self.assertEqual(p.hostname, b"www.python.org")
`
``
684
`+
self.assertEqual(p.port, 80)
`
``
685
`+
self.assertEqual(p.geturl(), base_url.encode("utf-8"))
`
``
686
+
``
687
`+
# Test that trailing space is preserved as some applications rely on
`
``
688
`+
# this within query strings.
`
``
689
`+
query_spaces_url = "https://www.python.org:88/doc/?query= "
`
``
690
`+
p = urllib.parse.urlsplit(noise.decode("utf-8") + query_spaces_url)
`
``
691
`+
self.assertEqual(p.scheme, "https")
`
``
692
`+
self.assertEqual(p.netloc, "www.python.org:88")
`
``
693
`+
self.assertEqual(p.path, "/doc/")
`
``
694
`+
self.assertEqual(p.query, "query= ")
`
``
695
`+
self.assertEqual(p.port, 88)
`
``
696
`+
self.assertEqual(p.geturl(), query_spaces_url)
`
``
697
+
``
698
`+
p = urllib.parse.urlsplit("www.pypi.org ")
`
``
699
`+
# That "hostname" gets considered a "path" due to the
`
``
700
`+
# trailing space and our existing logic... YUCK...
`
``
701
`+
# and re-assembles via geturl aka unurlsplit into the original.
`
``
702
`+
# django.core.validators.URLValidator (at least through v3.2) relies on
`
``
703
`+
# this, for better or worse, to catch it in a ValidationError via its
`
``
704
`+
# regular expressions.
`
``
705
`+
# Here we test the basic round trip concept of such a trailing space.
`
``
706
`+
self.assertEqual(urllib.parse.urlunsplit(p), "www.pypi.org ")
`
``
707
+
``
708
`+
# with scheme as cache-key
`
``
709
`+
url = "//www.python.org/"
`
``
710
`+
scheme = noise.decode("utf-8") + "https" + noise.decode("utf-8")
`
``
711
`+
for _ in range(2):
`
``
712
`+
p = urllib.parse.urlsplit(url, scheme=scheme)
`
``
713
`+
self.assertEqual(p.scheme, "https")
`
``
714
`+
self.assertEqual(p.geturl(), "https://www.python.org/")
`
``
715
+
657
716
`def test_attributes_bad_port(self):
`
658
717
`"""Check handling of invalid ports."""
`
659
718
`for bytes in (False, True):
`
660
719
`for parse in (urllib.parse.urlsplit, urllib.parse.urlparse):
`
661
720
`for port in ("foo", "1.5", "-1", "0x10", "-0", "1_1", " 1", "1 ", "६"):
`
662
721
`with self.subTest(bytes=bytes, parse=parse, port=port):
`
663
722
`netloc = "www.example.net:" + port
`
664
``
`-
url = "http://" + netloc
`
``
723
`+
url = "http://" + netloc + "/"
`
665
724
`if bytes:
`
666
725
`if netloc.isascii() and port.isascii():
`
667
726
`netloc = netloc.encode("ascii")
`