Skip to content

Commit

Permalink
Rework transformation of WARC record url to ZIM path and URL normaliz…
Browse files Browse the repository at this point in the history
…ation to support encoded URL and query strings
  • Loading branch information
benoit74 committed Mar 18, 2024
1 parent 4068d85 commit 17eabca
Show file tree
Hide file tree
Showing 12 changed files with 852 additions and 290 deletions.
12 changes: 8 additions & 4 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from warc2zim.content_rewriting.html import HtmlRewriter
from warc2zim.content_rewriting.js import JsRewriter
from warc2zim.content_rewriting.rx_replacer import RxRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
from warc2zim.utils import (
get_record_content,
get_record_encoding,
Expand Down Expand Up @@ -59,7 +59,7 @@ def __init__(
self,
path: str,
record: ArcWarcRecord,
known_urls: set[str],
existing_zim_paths: set[ZimPath],
):
self.content = get_record_content(record)

Expand All @@ -69,7 +69,9 @@ def __init__(

self.path = path
self.orig_url_str = get_record_url(record)
self.url_rewriter = ArticleUrlRewriter(self.orig_url_str, known_urls)
self.url_rewriter = ArticleUrlRewriter(
HttpUrl(self.orig_url_str), existing_zim_paths
)

self.rewrite_mode = self.get_rewrite_mode(record, mimetype)

Expand Down Expand Up @@ -133,7 +135,9 @@ def get_rewrite_mode(self, record, mimetype):
def rewrite_html(self, head_template: Template, css_insert: str | None):
orig_url = urlsplit(self.orig_url_str)

rel_static_prefix = self.url_rewriter.from_normalized("_zim_static/")
rel_static_prefix = self.url_rewriter.get_document_uri(
ZimPath("_zim_static/"), ""
)
head_insert = head_template.render(
path=self.path,
static_prefix=rel_static_prefix,
Expand Down
4 changes: 2 additions & 2 deletions src/warc2zim/content_rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ def handle_data(self, data: str):
elif self._active_tag == "style":
data = self.css_rewriter.rewrite(data)
elif self._active_tag == "script":
rules = get_ds_rules(self.url_rewriter.article_url)
rules = get_ds_rules(self.url_rewriter.article_url.value)
if data.strip():
data = JsRewriter(self.url_rewriter, rules).rewrite(data)
elif self._active_tag == "json":
if data.strip():
rules = get_ds_rules(self.url_rewriter.article_url)
rules = get_ds_rules(self.url_rewriter.article_url.value)
if rules:
data = RxRewriter(rules).rewrite(data, {})
self.send(data)
Expand Down
5 changes: 3 additions & 2 deletions src/warc2zim/content_rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
replace,
replace_prefix_from,
)
from warc2zim.url_rewriting import ZimPath

# Regex used to check if we ar import or exporting things in the string
# ie : If we are a module
Expand Down Expand Up @@ -236,8 +237,8 @@ def _get_module_decl(self, local_decls: Iterable[str]) -> str:
This will be added to script only if the script is a module script.
"""
wb_module_decl_url = self.url_rewriter.from_normalized(
"_zim_static/__wb_module_decl.js"
wb_module_decl_url = self.url_rewriter.get_document_uri(
ZimPath("_zim_static/__wb_module_decl.js"), ""
)
return (
f"""import {{ {", ".join(local_decls)} }} from "{wb_module_decl_url}";\n"""
Expand Down
83 changes: 47 additions & 36 deletions src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import time
from http import HTTPStatus
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlsplit, urlunsplit
from urllib.parse import urljoin, urlsplit, urlunsplit

import requests

Expand All @@ -46,7 +46,7 @@

from warc2zim.constants import logger
from warc2zim.items import StaticArticle, WARCPayloadItem
from warc2zim.url_rewriting import FUZZY_RULES, normalize
from warc2zim.url_rewriting import FUZZY_RULES, HttpUrl, ZimPath, normalize
from warc2zim.utils import (
get_record_content,
get_record_mime_type,
Expand Down Expand Up @@ -88,14 +88,15 @@ def __init__(self, args):
for handler in logger.handlers:
handler.setLevel(logging.DEBUG)

main_url: str = args.url
main_url: str | None = str(args.url) if args.url else None
# ensure trailing slash is added if missing
parts = urlsplit(main_url)
if parts.path == "":
parts = list(parts)
# set path
parts[2] = "/"
main_url = urlunsplit(parts)
if main_url:
parts = urlsplit(main_url)
if parts.path == "":
parts = list(parts)
# set path
parts[2] = "/"
main_url = urlunsplit(parts)

self.name = args.name
self.title = args.title
Expand All @@ -107,10 +108,10 @@ def __init__(self, args):
self.creator_metadata = args.creator
self.publisher = args.publisher
self.tags = DEFAULT_TAGS + (args.tags or [])
self.source: str = args.source or main_url
self.source: str | None = str(args.source) if args.source else None or main_url
self.scraper = "warc2zim " + get_version()
self.illustration = b""
self.main_path = normalize(main_url)
self.main_path = normalize(HttpUrl(main_url)) if main_url else None

self.output = Path(args.output)
self.zim_file = args.zim_file
Expand All @@ -132,8 +133,8 @@ def __init__(self, args):
self.custom_css = args.custom_css

self.indexed_urls = set({})
self.revisits = {}
self.warc_urls = set({})
self.revisits: dict[ZimPath, ZimPath] = {}
self.existing_zim_paths: set[ZimPath] = set()

# progress file handling
self.stats_filename = (
Expand Down Expand Up @@ -202,6 +203,8 @@ def run(self):
return 100

self.gather_information_from_warc()
if not self.main_path:
raise ValueError("Unable to find main path, aborting")

Check warning on line 207 in src/warc2zim/converter.py

View check run for this annotation

Codecov / codecov/patch

src/warc2zim/converter.py#L207

Added line #L207 was not covered by tests
self.title = self.title or "Untitled"
if len(self.title) > RECOMMENDED_MAX_TITLE_LENGTH:
self.title = f"{self.title[0:29]}…"
Expand Down Expand Up @@ -229,7 +232,7 @@ def run(self):

self.creator = Creator(
self.full_filename,
main_path=self.main_path,
main_path=self.main_path.value,
)

self.creator.config_metadata(
Expand All @@ -250,7 +253,7 @@ def run(self):
for filename in importlib.resources.files("warc2zim.statics").iterdir():
with importlib.resources.as_file(filename) as file:
self.creator.add_item(
StaticArticle(filename=file, main_path=self.main_path)
StaticArticle(filename=file, main_path=self.main_path.value)
)

# Add wombat_setup.js
Expand All @@ -272,7 +275,9 @@ def run(self):
if normalized_url not in self.indexed_urls:
logger.debug(f"Adding alias {normalized_url} -> {target_url}")
try:
self.creator.add_alias(normalized_url, "", target_url, {})
self.creator.add_alias(
normalized_url.value, "", target_url.value, {}
)
except RuntimeError as exc:
if not ALIAS_EXC_STR.match(str(exc)):
raise exc
Expand All @@ -299,9 +304,9 @@ def gather_information_from_warc(self):
continue

url = get_record_url(record)
normalized_url = normalize(url)
zim_path = normalize(HttpUrl(url))

self.warc_urls.add(normalized_url)
self.existing_zim_paths.add(zim_path)

if main_page_found:
continue
Expand All @@ -322,9 +327,9 @@ def gather_information_from_warc(self):
or record.http_headers.get_statuscode() == "200"
)
):
self.main_path = normalized_url
self.main_path = zim_path

if urldefrag(self.main_path).url != normalized_url:
if self.main_path != zim_path:
continue

# if we get here, found record for the main page
Expand All @@ -338,14 +343,16 @@ def gather_information_from_warc(self):
]:
original_path = self.main_path
self.main_path = normalize(
urljoin(
get_record_url(record),
record.http_headers.get_header("Location").strip(),
HttpUrl(
urljoin(
get_record_url(record),
record.http_headers.get_header("Location").strip(),
)
)
)
logger.warning(
f"HTTP {status_code} occurred on main page; "
f"replacing {original_path} with '{self.main_path}'"
f"replacing {original_path} with {self.main_path}"
)
continue

Expand Down Expand Up @@ -399,9 +406,11 @@ def find_icon_and_language(self, record, content):

# transform icon URL into WARC path
self.favicon_path = normalize(
urljoin(
get_record_url(record),
icon_url,
HttpUrl(
urljoin(
get_record_url(record),
icon_url,
)
)
)

Expand Down Expand Up @@ -443,7 +452,7 @@ def retrieve_illustration(self):
if record.rec_type != "response":
continue
url = get_record_url(record)
path = normalize(url)
path = normalize(HttpUrl(url))
if path == self.favicon_path or url == self.favicon_url:
logger.debug("Found WARC record for favicon")
if (
Expand Down Expand Up @@ -503,22 +512,20 @@ def is_self_redirect(self, record, url):
return False

location = record.http_headers.get("Location", "")
return normalize(url) == normalize(location)
location = urljoin(url, location)
return normalize(HttpUrl(url)) == normalize(HttpUrl(location))

def add_items_for_warc_record(self, record):

if record.rec_type not in ("response", "revisit"):
return

url = get_record_url(record)
normalized_url = normalize(url)
if not url:
logger.debug(f"Skipping record with empty WARC-Target-URI {record}")
return

if normalized_url in self.indexed_urls:
logger.debug(f"Skipping duplicate {url}, already added to ZIM")
return
normalized_url = normalize(HttpUrl(url))

# if include_domains is set, only include urls from those domains
if self.include_domains:
Expand All @@ -529,17 +536,21 @@ def add_items_for_warc_record(self, record):
logger.debug(f"Skipping url {url}, outside included domains")
return

if normalized_url in self.indexed_urls:
logger.debug(f"Skipping duplicate {url}, already added to ZIM")
return

if record.rec_type == "response":
if self.is_self_redirect(record, url):
logger.debug("Skipping self-redirect: " + url)
return

payload_item = WARCPayloadItem(
normalized_url,
normalized_url.value,
record,
self.head_template,
self.css_insert,
self.warc_urls,
self.existing_zim_paths,
)

if len(payload_item.content) != 0:
Expand All @@ -559,7 +570,7 @@ def add_items_for_warc_record(self, record):
and normalized_url not in self.revisits
): # pragma: no branch
self.revisits[normalized_url] = normalize(
record.rec_headers["WARC-Refers-To-Target-URI"]
HttpUrl(record.rec_headers["WARC-Refers-To-Target-URI"])
)


Expand Down
5 changes: 3 additions & 2 deletions src/warc2zim/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from zimscraperlib.zim.items import StaticItem

from warc2zim.content_rewriting.generic import Rewriter
from warc2zim.url_rewriting import ZimPath
from warc2zim.utils import get_record_mime_type


Expand All @@ -29,13 +30,13 @@ def __init__(
record: ArcWarcRecord,
head_template: Template,
css_insert: str | None,
known_urls: set[str],
existing_zim_paths: set[ZimPath],
):
super().__init__()

self.path = path
self.mimetype = get_record_mime_type(record)
(self.title, self.content) = Rewriter(path, record, known_urls).rewrite(
(self.title, self.content) = Rewriter(path, record, existing_zim_paths).rewrite(
head_template, css_insert
)

Expand Down
Loading

0 comments on commit 17eabca

Please sign in to comment.