From d71e7a063a56af8a75d7bf64bc6872b8fa001161 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 18 Jul 2024 11:44:09 +0000
Subject: [PATCH 1/3] Add support for redirects in meta http-equiv tag
---
CHANGELOG.md | 4 +
src/warc2zim/content_rewriting/html.py | 32 ++++
test-website/content/http-equiv-redirect.html | 23 +++
test-website/content/index.html | 1 +
tests/conftest.py | 9 +-
tests/test_html_rewriting.py | 146 ++++++++++++++++++
6 files changed, 211 insertions(+), 4 deletions(-)
create mode 100644 test-website/content/http-equiv-redirect.html
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2da34bd..48a2582 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Added
+
+- Properly rewrite redirect target URL when present in `<meta>` HTML tag (#237)
+
### Changed
- Generate fuzzy rules tests in Python and Javascript (#284)
diff --git a/src/warc2zim/content_rewriting/html.py b/src/warc2zim/content_rewriting/html.py
index 4b852fb..78312aa 100644
--- a/src/warc2zim/content_rewriting/html.py
+++ b/src/warc2zim/content_rewriting/html.py
@@ -1,4 +1,5 @@
import io
+import re
from collections import namedtuple
from collections.abc import Callable
from dataclasses import dataclass
@@ -19,6 +20,10 @@
RewritenHtml = namedtuple("RewritenHmtl", ["title", "content"])
+HTTP_EQUIV_REDIRECT_RE = re.compile(
+    r"^\s*(?P<interval>.*?)\s*;\s*url\s*=\s*(?P<url>.*?)\s*$"
+)
+
def get_attr_value_from(
attrs: AttrsList, name: str, default: str | None = None
@@ -626,3 +631,30 @@ def rewrite_js_data(
data,
opts={"isModule": html_rewrite_context == "js-module"},
)
+
+
+@rules.rewrite_attribute()
+def rewrite_meta_http_equiv_redirect(
+    tag: str,
+    attr_name: str,
+    attr_value: str | None,
+    attrs: AttrsList,
+    url_rewriter: ArticleUrlRewriter,
+    base_href: str | None,
+) -> AttrNameAndValue | None:
+    """Rewrite the target URL of a meta http-equiv refresh content attribute"""
+    if tag != "meta":
+        return  # rule only applies to meta tags
+    if attr_name != "content":
+        return  # the refresh directive is parsed from the content attribute only
+    if not attr_value:
+        return  # missing or empty content: nothing to rewrite
+    http_equiv = get_attr_value_from(attrs, "http-equiv")
+    if http_equiv != "refresh":
+        return  # NOTE(review): exact match, a value like "Refresh" is not handled
+    if (match := HTTP_EQUIV_REDIRECT_RE.match(attr_value)) is None:
+        return  # value does not have the "interval;url=target" shape
+    return (
+        attr_name,
+        f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}",
+    )
diff --git a/test-website/content/http-equiv-redirect.html b/test-website/content/http-equiv-redirect.html
new file mode 100644
index 0000000..7889c01
--- /dev/null
+++ b/test-website/content/http-equiv-redirect.html
@@ -0,0 +1,23 @@
+
+
+
+
+
+ Test website
+
+
+
+
+
+
+
+
+
+
+ Redirect with http-equiv meta directive
+
+ You should be redirected to home page in 3 seconds
+
+
+
+
diff --git a/test-website/content/index.html b/test-website/content/index.html
index 8197da2..17d8add 100644
--- a/test-website/content/index.html
+++ b/test-website/content/index.html
@@ -49,6 +49,7 @@
links to folder instead of file
Bad redirections
Handling of content types
+ Redirect with http-equiv meta directive
whatever
diff --git a/tests/conftest.py b/tests/conftest.py
index d01c845..3e893bc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,8 +21,9 @@ def no_js_notify_handler(_: str):
class SimpleUrlRewriter(ArticleUrlRewriter):
"""Basic URL rewriter mocking most calls"""
- def __init__(self, article_url: HttpUrl):
+ def __init__(self, article_url: HttpUrl, suffix: str = ""):
self.article_url = article_url
+ self.suffix = suffix
def __call__(
self,
@@ -31,7 +32,7 @@ def __call__(
*,
rewrite_all_url: bool = True, # noqa: ARG002
) -> str:
- return item_url
+ return item_url + self.suffix
def get_item_path(
self, item_url: str, base_href: str | None # noqa: ARG002
@@ -48,8 +49,8 @@ def get_document_uri(
def simple_url_rewriter():
"""Fixture to create a basic url rewriter returning URLs as-is"""
- def get_simple_url_rewriter(url: str):
- return SimpleUrlRewriter(HttpUrl(url))
+ def get_simple_url_rewriter(url: str, suffix: str = ""):
+ return SimpleUrlRewriter(HttpUrl(url), suffix=suffix)
yield get_simple_url_rewriter
diff --git a/tests/test_html_rewriting.py b/tests/test_html_rewriting.py
index 658e510..2616064 100644
--- a/tests/test_html_rewriting.py
+++ b/tests/test_html_rewriting.py
@@ -11,6 +11,7 @@
extract_base_href,
format_attr,
get_attr_value_from,
+ rewrite_meta_http_equiv_redirect,
)
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
@@ -903,6 +904,43 @@ def test_rewrite_meta_charset(rewrite_meta_charset_content, no_js_notify):
)
+@pytest.fixture(
+ params=[
+ ContentForTests(
+ '
'
+ "