Skip to content

Commit

Permalink
IRI normalization: include path and fragment (#58)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Oct 20, 2023
1 parent 869912c commit 018d01a
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 4 deletions.
6 changes: 3 additions & 3 deletions courlan/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import re

from typing import Optional, Union
from urllib.parse import parse_qs, urlencode, urlunsplit, SplitResult
from urllib.parse import parse_qs, quote, urlencode, urlunsplit, SplitResult

from .filters import validate_url
from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN
Expand Down Expand Up @@ -166,12 +166,12 @@ def normalize_url(
netloc = decode_punycode(netloc.lower())
# path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py
# leading /../'s in the path are removed
newpath = PATH2.sub("", PATH1.sub("/", parsed_url.path))
newpath = quote(PATH2.sub("", PATH1.sub("/", parsed_url.path)))
# strip unwanted query elements
newquery = clean_query(parsed_url, strict, language) or ""
if newquery and newpath == "":
newpath = "/"
# fragment
newfragment = "" if strict else parsed_url.fragment
newfragment = "" if strict else quote(parsed_url.fragment)
# rebuild
return urlunsplit([scheme, netloc, newpath, newquery, newfragment])
2 changes: 1 addition & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def test_fix_relative():
def test_scrub():
# clean: scrub + normalize
assert clean_url(5) is None
assert clean_url("ø\xaa") == "øª"
assert clean_url("ø\xaa") == "%C3%B8%C2%AA"
assert clean_url("https://example.org/?p=100") == "https://example.org/?p=100"
assert (
clean_url("https://example.org:443/file.html?p=100&abc=1#frag")
Expand Down

0 comments on commit 018d01a

Please sign in to comment.