fix: IRI to URI conversion (#2304) · RDFLib/rdflib@dfa4054 (original) (raw)
`@@ -522,32 +522,92 @@ def _coalesce(
`
522
522
`return default
`
523
523
``
524
524
``
``
525
`+
_RFC3986_SUBDELIMS = "!$&'()*+,;="
`
``
526
`+
"""
`
``
527
``sub-delims`` production from `RFC 3986, section 2.2
``
528
`` +
<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_.
``
``
529
`+
"""
`
``
530
+
``
531
`+
_RFC3986_PCHAR_NU = "%" + _RFC3986_SUBDELIMS + ":@"
`
``
532
`+
"""
`
``
533
The non-unreserved characters in the ``pchar`` production from RFC 3986.
``
534
`+
"""
`
``
535
+
``
536
`+
_QUERY_SAFE_CHARS = _RFC3986_PCHAR_NU + "/?"
`
``
537
`+
"""
`
``
538
`+
The non-unreserved characters that are safe to use in the query and fragment
`
``
539
`+
components.
`
``
540
+
``
541
`+
.. code-block::
`
``
542
+
``
543
`+
pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
`
``
543
`+
query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )
`
``
545
`+
"""
`
``
546
+
``
547
`+
_USERNAME_SAFE_CHARS = _RFC3986_SUBDELIMS + "%"
`
``
548
`+
"""
`
``
549
`+
The non-unreserved characters that are safe to use in the username and password
`
``
550
`+
components.
`
``
551
+
``
552
`+
.. code-block::
`
``
553
+
``
554
`+
userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
`
``
555
+
``
556
`+
":" is excluded as this is only used for the username and password components,
`
``
557
`+
and they are treated separately.
`
``
558
`+
"""
`
``
559
+
``
560
`+
_PATH_SAFE_CHARS = _RFC3986_PCHAR_NU + "/"
`
``
561
`+
"""
`
``
562
`+
The non-unreserved characters that are safe to use in the path component.
`
``
563
+
``
564
+
``
565
`+
This is based on various path-related productions from RFC 3986.
`
``
566
`+
"""
`
``
567
+
``
568
+
525
569
`def _iri2uri(iri: str) -> str:
`
526
570
`"""
`
527
``
`-
Convert an IRI to a URI (Python 3).
`
528
``
`-
https://stackoverflow.com/a/42309027
`
529
``
`-
https://stackoverflow.com/a/40654295
`
530
``
`-
netloc should be encoded using IDNA;
`
531
``
`-
non-ascii URL path should be encoded to UTF-8 and then percent-escaped;
`
532
``
`-
non-ascii query parameters should be encoded to the encoding of a page
`
533
``
`-
URL was extracted from (or to the encoding server uses), then
`
534
``
`-
percent-escaped.
`
``
571
`+
Prior art:
`
``
572
+
``
573
`` +
`iri_to_uri from Werkzeug <https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931>`_
``
``
574
+
535
575
` >>> _iri2uri("https://dbpedia.org/resource/Almería")
`
536
576
` 'https://dbpedia.org/resource/Almer%C3%ADa'
`
537
577
` """
`
``
578
`+
# https://datatracker.ietf.org/doc/html/rfc3986
`
538
579
`# https://datatracker.ietf.org/doc/html/rfc3305
`
539
580
``
540
``
`-
(scheme, netloc, path, query, fragment) = urlsplit(iri)
`
``
581
`+
parts = urlsplit(iri)
`
``
582
`+
(scheme, netloc, path, query, fragment) = parts
`
541
583
``
542
``
`-
# Just support http/https, otherwise return the iri unmolested
`
``
584
`+
# Just support http/https, otherwise return the iri unaltered
`
543
585
`if scheme not in ["http", "https"]:
`
544
586
`return iri
`
545
587
``
546
``
`-
scheme = quote(scheme)
`
547
``
`-
netloc = netloc.encode("idna").decode("utf-8")
`
548
``
`-
path = quote(path)
`
549
``
`-
query = quote(query)
`
550
``
`-
fragment = quote(fragment)
`
``
588
`+
path = quote(path, safe=_PATH_SAFE_CHARS)
`
``
589
`+
query = quote(query, safe=_QUERY_SAFE_CHARS)
`
``
590
`+
fragment = quote(fragment, safe=_QUERY_SAFE_CHARS)
`
``
591
+
``
592
`+
if parts.hostname:
`
``
593
`+
netloc = parts.hostname.encode("idna").decode("ascii")
`
``
594
`+
else:
`
``
595
`+
netloc = ""
`
``
596
+
``
597
`+
if ":" in netloc:
`
``
598
`+
# Quote IPv6 addresses
`
``
599
`+
netloc = f"[{netloc}]"
`
``
600
+
``
601
`+
if parts.port:
`
``
602
`+
netloc = f"{netloc}:{parts.port}"
`
``
603
+
``
604
`+
if parts.username:
`
``
605
`+
auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS)
`
``
606
`+
if parts.password:
`
``
607
`+
pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS)
`
``
608
`+
auth = f"{auth}:{pass_quoted}"
`
``
609
`+
netloc = f"{auth}@{netloc}"
`
``
610
+
551
611
`uri = urlunsplit((scheme, netloc, path, query, fragment))
`
552
612
``
553
613
`if iri.endswith("#") and not uri.endswith("#"):
`