fix: IRI to URI conversion (#2304) · RDFLib/rdflib@dfa4054 (original) (raw)

`@@ -522,32 +522,92 @@ def _coalesce(

`

522

522

`return default

`

523

523

``

524

524

``

``

525

`+

_RFC3986_SUBDELIMS = "!$&'()*+,;="

`

``

526

`+

"""

`

``

527


``sub-delims`` production from `RFC 3986, section 2.2

``

528

`` +

<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_.

``

``

529

`+

"""

`

``

530

+

``

531

`+

_RFC3986_PCHAR_NU = "%" + _RFC3986_SUBDELIMS + ":@"

`

``

532

`+

"""

`

``

533


The non-unreserved characters in the ``pchar`` production from RFC 3986.

``

534

`+

"""

`

``

535

+

``

536

`+

_QUERY_SAFE_CHARS = _RFC3986_PCHAR_NU + "/?"

`

``

537

`+

"""

`

``

538

`+

The non-unreserved characters that are safe to use in the query and fragment

`

``

539

`+

components.

`

``

540

+

``

541

`+

.. code-block::

`

``

542

+

``

543

`+

pchar = unreserved / pct-encoded / sub-delims / ":" / "@"

`

``

544

`+

query = *( pchar / "/" / "?" )
fragment = *( pchar / "/" / "?" )

`

``

545

`+

"""

`

``

546

+

``

547

`+

_USERNAME_SAFE_CHARS = _RFC3986_SUBDELIMS + "%"

`

``

548

`+

"""

`

``

549

`+

The non-unreserved characters that are safe to use in the username and password

`

``

550

`+

components.

`

``

551

+

``

552

`+

.. code-block::

`

``

553

+

``

554

`+

userinfo = *( unreserved / pct-encoded / sub-delims / ":" )

`

``

555

+

``

556

`+

":" is excluded as this is only used for the username and password components,

`

``

557

`+

and they are treated separately.

`

``

558

`+

"""

`

``

559

+

``

560

`+

_PATH_SAFE_CHARS = _RFC3986_PCHAR_NU + "/"

`

``

561

`+

"""

`

``

562

`+

The non-unreserved characters that are safe to use in the path component.

`

``

563

+

``

564

+

``

565

`+

This is based on various path-related productions from RFC 3986.

`

``

566

`+

"""

`

``

567

+

``

568

+

525

569

`def _iri2uri(iri: str) -> str:

`

526

570

`"""

`

527

``

`-

Convert an IRI to a URI (Python 3).

`

528

``

`-

https://stackoverflow.com/a/42309027

`

529

``

`-

https://stackoverflow.com/a/40654295

`

530

``

`-

netloc should be encoded using IDNA;

`

531

``

`-

non-ascii URL path should be encoded to UTF-8 and then percent-escaped;

`

532

``

`-

non-ascii query parameters should be encoded to the encoding of a page

`

533

``

`-

URL was extracted from (or to the encoding server uses), then

`

534

``

`-

percent-escaped.

`

``

571

`+

Prior art:

`

``

572

+

``

573

`` +

``

``

574

+

535

575

` >>> _iri2uri("https://dbpedia.org/resource/Almería")

`

536

576

` 'https://dbpedia.org/resource/Almer%C3%ADa'

`

537

577

` """

`

``

578

`+

# https://datatracker.ietf.org/doc/html/rfc3986

`

538

579

`# https://datatracker.ietf.org/doc/html/rfc3305

`

539

580

``

540

``

`-

(scheme, netloc, path, query, fragment) = urlsplit(iri)

`

``

581

`+

parts = urlsplit(iri)

`

``

582

`+

(scheme, netloc, path, query, fragment) = parts

`

541

583

``

542

``

`-

# Just support http/https, otherwise return the iri unmolested

`

``

584

`+

# Just support http/https, otherwise return the iri unaltered

`

543

585

`if scheme not in ["http", "https"]:

`

544

586

`return iri

`

545

587

``

546

``

`-

scheme = quote(scheme)

`

547

``

`-

netloc = netloc.encode("idna").decode("utf-8")

`

548

``

`-

path = quote(path)

`

549

``

`-

query = quote(query)

`

550

``

`-

fragment = quote(fragment)

`

``

588

`+

path = quote(path, safe=_PATH_SAFE_CHARS)

`

``

589

`+

query = quote(query, safe=_QUERY_SAFE_CHARS)

`

``

590

`+

fragment = quote(fragment, safe=_QUERY_SAFE_CHARS)

`

``

591

+

``

592

`+

if parts.hostname:

`

``

593

`+

netloc = parts.hostname.encode("idna").decode("ascii")

`

``

594

`+

else:

`

``

595

`+

netloc = ""

`

``

596

+

``

597

`+

if ":" in netloc:

`

``

598

`+

# Quote IPv6 addresses

`

``

599

`+

netloc = f"[{netloc}]"

`

``

600

+

``

601

`+

if parts.port:

`

``

602

`+

netloc = f"{netloc}:{parts.port}"

`

``

603

+

``

604

`+

if parts.username:

`

``

605

`+

auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS)

`

``

606

`+

if parts.password:

`

``

607

`+

pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS)

`

``

608

`+

auth = f"{auth}:{pass_quoted}"

`

``

609

`+

netloc = f"{auth}@{netloc}"

`

``

610

+

551

611

`uri = urlunsplit((scheme, netloc, path, query, fragment))

`

552

612

``

553

613

`if iri.endswith("#") and not uri.endswith("#"):

`