Skip to content

Commit

Permalink
[#799] replaced parse_url() with urlparse()
Browse files Browse the repository at this point in the history
  • Loading branch information
LeXofLeviafan committed Dec 8, 2024
1 parent 789c08f commit aa87b88
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 35 deletions.
52 changes: 31 additions & 21 deletions buku
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,12 @@ from typing import Any, Dict, List, Optional, Tuple, NamedTuple
from collections.abc import Sequence, Set, Callable
from warnings import warn
import xml.etree.ElementTree as ET
from urllib.parse import urlparse # urllib3.util.parse_url() encodes netloc

import urllib3
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
from urllib3.exceptions import LocationParseError
from urllib3.util import Retry, make_headers, parse_url
from urllib3.util import Retry, make_headers

try:
from mypy_extensions import TypedDict
Expand Down Expand Up @@ -4053,6 +4053,20 @@ def import_html(html_soup: BeautifulSoup, add_parent_folder_as_tag: bool, newtag
)


def get_netloc(url):
    """Return the netloc component of url, or None if absent or unparseable."""

    try:
        parsed = urlparse(url)
        host = parsed.netloc
        if not host and not parsed.scheme:
            # Neither scheme nor netloc: retry as a scheme-relative URL
            host = urlparse('//' + url).netloc
        return host or None
    except Exception as e:
        # urlparse() may raise e.g. ValueError on malformed input ('http://[')
        LOGERR('%s, URL: %s', e, url)
        return None


def is_bad_url(url):
"""Check if URL is malformed.
Expand All @@ -4069,16 +4083,8 @@ def is_bad_url(url):
True if URL is malformed, False otherwise.
"""

# Get the netloc token
try:
netloc = parse_url(url).netloc
if not netloc:
# Try of prepend '//' and get netloc
netloc = parse_url('//' + url).netloc
if not netloc:
return True
except LocationParseError as e:
LOGERR('%s, URL: %s', e, url)
netloc = get_netloc(url)
if not netloc:
return True

LOGDBG('netloc: %s', netloc)
Expand All @@ -4088,10 +4094,7 @@ def is_bad_url(url):
return True

# netloc should have at least one '.'
if netloc.rfind('.') < 0:
return True

return False
return '.' not in netloc


def is_nongeneric_url(url):
Expand Down Expand Up @@ -4277,6 +4280,14 @@ def get_data_from_page(resp):
return (None, None, None)


def extract_auth(url):
    """Split the auth credentials off a URL.

    Parameters
    ----------
    url : str
        URL, possibly containing a 'user[:password]@' auth token in its netloc.

    Returns
    -------
    tuple
        (auth, url) where auth is the 'user[:password]' string (None when the
        netloc has no '@'), and url is the input with the auth part removed.
    """
    parsed = urlparse(url)
    if parsed.username is None:  # no '@' in netloc
        return None, url
    auth = parsed.username + ('' if parsed.password is None else f':{parsed.password}')
    # Remove only the FIRST occurrence: the auth separator always precedes any
    # later '@' in the path/query, which must be left intact (with auth == ''
    # an unbounded replace would strip every '@' in the URL).
    return auth, url.replace(auth + '@', '', 1)

def gen_headers():
"""Generate headers for network connection."""

Expand All @@ -4293,15 +4304,14 @@ def gen_headers():
MYPROXY = os.environ.get('https_proxy')
if MYPROXY:
try:
url = parse_url(MYPROXY)
auth, MYPROXY = extract_auth(MYPROXY)
except Exception as e:
LOGERR(e)
return

# Strip username and password (if present) and update headers
if url.auth:
MYPROXY = MYPROXY.replace(url.auth + '@', '')
auth_headers = make_headers(basic_auth=url.auth)
if auth:
auth_headers = make_headers(basic_auth=auth)
MYHEADERS.update(auth_headers)

LOGDBG('proxy: [%s]', MYPROXY)
Expand Down Expand Up @@ -5179,7 +5189,7 @@ def browse(url):
If True, tries to open links in a GUI based browser.
"""

if not parse_url(url).scheme:
if not urlparse(url).scheme:
# Prefix with 'http://' if no scheme
# Otherwise, opening in browser fails anyway
# We expect http to https redirection
Expand Down
66 changes: 52 additions & 14 deletions tests/test_buku.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pytest

from buku import DELIM, FIELD_FILTER, ALL_FIELDS, SortKey, FetchResult, is_int, prep_tag_search, \
print_rec_with_filter, parse_range, split_by_marker
print_rec_with_filter, get_netloc, extract_auth, parse_range, split_by_marker


def check_import_html_results_contains(result, expected_result):
Expand All @@ -26,20 +26,58 @@ def check_import_html_results_contains(result, expected_result):
return count == n * (n + 1) / 2


@pytest.mark.parametrize(
"url, exp_res",
[
["http://example.com", False],
["ftp://ftp.somedomain.org", False],
["http://examplecom.", True],
["http://.example.com", True],
["http://example.com.", True],
["about:newtab", True],
["chrome://version/", True],
],
)
@pytest.mark.parametrize('url, result', [
    ('http://user:password@hostname:1234/path?query#hash', 'user:password'),
    ('http://:password@hostname:1234/path?query#hash', ':password'),
    ('http://user:@hostname:1234/path?query#hash', 'user:'),
    ('http://user@hostname:1234/path?query#hash', 'user'),
    ('http://@hostname:1234/path?query#hash', ''),
    ('http://hostname:1234/path?query#hash', None),
    ('//[', ValueError('Invalid IPv6 URL')),
    ('//⁈', ValueError("netloc '⁈' contains invalid characters under NFKC normalization")),
])
def test_extract_auth(url, result):
    """extract_auth() splits off the auth token, or propagates parsing errors."""
    if isinstance(result, Exception):
        with pytest.raises(Exception) as excinfo:
            extract_auth(url)
        assert repr(excinfo.value) == repr(result)
    else:
        assert extract_auth(url) == (result, 'http://hostname:1234/path?query#hash')


@pytest.mark.parametrize('url, netloc', [
    ('http://example.com', 'example.com'),
    ('example.com/#foo/bar', 'example.com'),
    ('ftp://ftp.somedomain.org', 'ftp.somedomain.org'),
    ('about:newtab', None),
    ('chrome://version/', 'version'),
    ('javascript:void(0.0)', None),
    ('data:,text.with.dots', None),
    ('http://[', None),  # parsing error
    ('http://⁈', None),  # parsing error
])
def test_get_netloc(url, netloc):
    """get_netloc() yields the hostname token, or None when unavailable."""
    result = get_netloc(url)
    assert result == netloc


@pytest.mark.parametrize('url, exp_res', [
['http://example.com', False],
['example.com/#foo/bar', False],
['ftp://ftp.somedomain.org', False],
['http://examplecom.', True], # ends with a '.'
['http://.example.com', True], # starts with a '.'
['http://example.com.', True], # ends with a '.'
['about:newtab', True],
['chrome://version/', True], # contains no '.'
['javascript:void(0.0)', True],
['data:,text.with.dots', True],
['http://[', True], # parsing error
['http://⁈', True], # parsing error
])
def test_is_bad_url(url, exp_res):
"""test func."""
import buku

res = buku.is_bad_url(url)
Expand Down

0 comments on commit aa87b88

Please sign in to comment.