Replace remaining urlparse calls with urlsplit (2x faster) #50

Merged (1 commit, Jun 30, 2023)
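Background on the change: urlsplit does the same work as urlparse except that it skips the rarely used params component (the part of the path after a semicolon), which makes it cheaper per call. Below is a minimal, stdlib-only sketch of the difference plus a rough timing comparison; the example URL is made up, the numbers are machine-dependent, and the "2x" figure is the one claimed in the title, not something this snippet guarantees.

    from timeit import timeit
    from urllib.parse import urlparse, urlsplit

    url = "https://example.org/path;matrix?q=1#frag"

    # urlparse returns a ParseResult with a separate `params` field,
    # urlsplit returns a SplitResult that leaves ";matrix" in the path
    print(urlparse(url))
    print(urlsplit(url))

    # rough, machine-dependent timing; urlsplit avoids the extra params split
    print(timeit(lambda: urlparse(url), number=100_000))
    print(timeit(lambda: urlsplit(url), number=100_000))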
courlan/clean.py: 3 additions, 3 deletions

@@ -9,7 +9,7 @@
 import re
 
 from typing import Optional, Union
-from urllib.parse import parse_qs, urlencode, urlunsplit, ParseResult
+from urllib.parse import parse_qs, urlencode, urlunsplit, SplitResult
 
 from .filters import validate_url
 from .settings import ALLOWED_PARAMS, CONTROL_PARAMS, TARGET_LANG_DE, TARGET_LANG_EN
@@ -98,7 +98,7 @@ def scrub_url(url: str) -> str:
 
 
 def clean_query(
-    parsed_url: ParseResult, strict: bool = False, language: Optional[str] = None
+    parsed_url: SplitResult, strict: bool = False, language: Optional[str] = None
 ) -> str:
     """Strip unwanted query elements"""
     if len(parsed_url.query) > 0:
@@ -148,7 +148,7 @@ def decode_punycode(string: str) -> str:
 
 
 def normalize_url(
-    parsed_url: Union[ParseResult, str],
+    parsed_url: Union[SplitResult, str],
     strict: bool = False,
     language: Optional[str] = None,
 ) -> str:
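A short usage sketch for the two functions touched in clean.py; the URL and query parameters below are invented for illustration, and the exact output is not asserted here, only that both functions now accept a SplitResult produced by urlsplit.

    from urllib.parse import urlsplit
    from courlan.clean import clean_query, normalize_url

    # both functions now operate on a SplitResult instead of a ParseResult
    parsed = urlsplit("https://example.org/page?utm_source=feed&lang=en")
    print(clean_query(parsed, strict=True, language="en"))    # cleaned query string
    print(normalize_url(parsed, strict=True, language="en"))  # normalized full URL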
courlan/filters.py: 2 additions, 2 deletions

@@ -10,7 +10,7 @@
 import re
 
 from typing import Any, Optional, Tuple
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from langcodes import Language, tag_is_valid
 
@@ -199,7 +199,7 @@ def type_filter(url: str, strict: bool = False, with_nav: bool = False) -> bool:
 def validate_url(url: Optional[str]) -> Tuple[bool, Any]:
     """Parse and validate the input"""
     try:
-        parsed_url = urlparse(url)
+        parsed_url = urlsplit(url)
     except ValueError:
         return False, None
     if not bool(parsed_url.scheme) or parsed_url.scheme not in (
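For reference, a sketch of how the validation entry point behaves after the swap. The hunk only shows the failure branches (ValueError and missing/unknown scheme return (False, None)); what exactly is returned for valid input is an assumption beyond what the diff shows, other than the parsed object now being a SplitResult.

    from courlan.filters import validate_url

    # scheme-less or unparsable input ends up in the (False, None) branch above;
    # for valid input the second element is assumed to be the new SplitResult
    ok, parsed = validate_url("https://example.org/path")
    print(ok, parsed)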
courlan/urlutils.py: 4 additions, 4 deletions

@@ -6,7 +6,7 @@
 
 from functools import lru_cache
 from typing import Any, List, Optional, Set, Tuple, Union
-from urllib.parse import urljoin, urlparse, urlsplit, urlunsplit, ParseResult
+from urllib.parse import urljoin, urlsplit, urlunsplit, SplitResult
 
 from tld import get_tld
 
@@ -67,11 +67,11 @@ def extract_domain(
     return full_domain
 
 
-def _parse(url: Any) -> ParseResult:
+def _parse(url: Any) -> SplitResult:
     "Parse a string or use urllib.parse object directly."
     if isinstance(url, str):
-        parsed_url = urlparse(url)
-    elif isinstance(url, ParseResult):
+        parsed_url = urlsplit(url)
+    elif isinstance(url, SplitResult):
         parsed_url = url
     else:
         raise TypeError("wrong input type:", type(url))
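The private helper now accepts either representation. A quick sketch of the three branches in the hunk above; importing the underscore-prefixed helper directly is done here only for illustration.

    from urllib.parse import SplitResult, urlsplit
    from courlan.urlutils import _parse

    # strings are parsed, existing SplitResult objects pass through unchanged,
    # and anything else raises TypeError, matching the branches shown above
    assert isinstance(_parse("https://example.org/"), SplitResult)
    assert isinstance(_parse(urlsplit("https://example.org/")), SplitResult)
    try:
        _parse(1.23)
    except TypeError as err:
        print("rejected:", err)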
tests/unit_tests.py: 3 additions, 3 deletions

@@ -14,7 +14,7 @@
 
 from contextlib import redirect_stdout
 from unittest.mock import patch
-from urllib.parse import ParseResult, urlsplit
+from urllib.parse import SplitResult, urlsplit
 
 import pytest
 
@@ -672,9 +672,9 @@ def test_urlutils():
     assert extract_domain("http://example.com#fragment", fast=True) == "example.com"
     # url parsing
     result = _parse("https://httpbin.org/")
-    assert isinstance(result, ParseResult)
+    assert isinstance(result, SplitResult)
     newresult = _parse(result)
-    assert isinstance(result, ParseResult)
+    assert isinstance(result, SplitResult)
     with pytest.raises(TypeError):
         result = _parse(1.23)
 
Expand Down