From 018d01a03932f96b4f2844178485a4c46f83150e Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 20 Oct 2023 14:15:04 +0200 Subject: [PATCH] IRI normalization: include path and fragment (#58) --- courlan/clean.py | 6 +++--- tests/unit_tests.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/courlan/clean.py b/courlan/clean.py index 9fa7283..a2b25d5 100644 --- a/courlan/clean.py +++ b/courlan/clean.py @@ -9,7 +9,7 @@ import re from typing import Optional, Union -from urllib.parse import parse_qs, urlencode, urlunsplit, SplitResult +from urllib.parse import parse_qs, quote, urlencode, urlunsplit, SplitResult from .filters import validate_url from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN @@ -166,12 +166,12 @@ def normalize_url( netloc = decode_punycode(netloc.lower()) # path: https://github.com/saintamh/alcazar/blob/master/alcazar/utils/urls.py # leading /../'s in the path are removed - newpath = PATH2.sub("", PATH1.sub("/", parsed_url.path)) + newpath = quote(PATH2.sub("", PATH1.sub("/", parsed_url.path))) # strip unwanted query elements newquery = clean_query(parsed_url, strict, language) or "" if newquery and newpath == "": newpath = "/" # fragment - newfragment = "" if strict else parsed_url.fragment + newfragment = "" if strict else quote(parsed_url.fragment) # rebuild return urlunsplit([scheme, netloc, newpath, newquery, newfragment]) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 0e3281a..8e8ab10 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -147,7 +147,7 @@ def test_fix_relative(): def test_scrub(): # clean: scrub + normalize assert clean_url(5) is None - assert clean_url("ø\xaa") == "øª" + assert clean_url("ø\xaa") == "%C3%B8%C2%AA" assert clean_url("https://example.org/?p=100") == "https://example.org/?p=100" assert ( clean_url("https://example.org:443/file.html?p=100&abc=1#frag")