Skip to content

Commit

Permalink
maintenance: simplify code (#79)
Browse files: browse the repository at this point in the history
* simplify code

* delete line
  • Loading branch information
adbar authored Jan 16, 2024
1 parent 80fdf0a commit bc2e64c
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 41 deletions.
48 changes: 23 additions & 25 deletions courlan/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,33 +151,31 @@ def _cli_process(args: Any) -> None:
break
batches.append(line_batch)

if batches:
futures = (
executor.submit(
_cli_check_urls,
batch,
strict=args.strict,
with_redirects=args.redirects,
language=args.language,
)
for batch in batches
)

for future in as_completed(futures):
for valid, url in future.result():
if valid:
outputfh.write(url + "\n")
# proceed with discarded URLs. to be rewritten
elif args.discardedfile is not None:
with open(
args.discardedfile, "a", encoding="utf-8"
) as discardfh:
discardfh.write(url)

batches = []
else:
if not batches:
break

futures = (
executor.submit(
_cli_check_urls,
batch,
strict=args.strict,
with_redirects=args.redirects,
language=args.language,
)
for batch in batches
)

for future in as_completed(futures):
for valid, url in future.result():
if valid:
outputfh.write(url + "\n")
# proceed with discarded URLs. to be rewritten
elif args.discardedfile is not None:
with open(
args.discardedfile, "a", encoding="utf-8"
) as discardfh:
discardfh.write(url)


def process_args(args: Any) -> None:
"""Start processing according to the arguments"""
Expand Down
11 changes: 3 additions & 8 deletions courlan/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,17 +167,12 @@ def domain_filter(domain: str) -> bool:
return False

# unsuitable content
if UNSUITABLE_DOMAIN.match(domain):
return False

if FILE_TYPE.search(domain):
if UNSUITABLE_DOMAIN.match(domain) or FILE_TYPE.search(domain):
return False

# extensions
extension_match = EXTENSION_REGEX.search(domain)
if extension_match and extension_match[0] in WHITELISTED_EXTENSIONS:
return False

return True
return not extension_match or extension_match[0] not in WHITELISTED_EXTENSIONS


def extension_filter(urlpath: str) -> bool:
Expand Down
14 changes: 6 additions & 8 deletions courlan/urlutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,12 @@ def extract_domain(
blacklist = set()
# new code: Python >= 3.6 with tld module
domain, full_domain = get_tldinfo(url, fast=fast)
# invalid input
if full_domain is None:
return None
# blacklisting
if domain in blacklist or full_domain in blacklist:
return None
# return domain
return full_domain

return (
full_domain
if full_domain and not domain in blacklist and not full_domain in blacklist
else None
)


def _parse(url: Any) -> SplitResult:
Expand Down

0 comments on commit bc2e64c

Please sign in to comment.