From d71e7a063a56af8a75d7bf64bc6872b8fa001161 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 18 Jul 2024 11:44:09 +0000 Subject: [PATCH 1/3] Add support for redirects in meta http-equiv tag --- CHANGELOG.md | 4 + src/warc2zim/content_rewriting/html.py | 32 ++++ test-website/content/http-equiv-redirect.html | 23 +++ test-website/content/index.html | 1 + tests/conftest.py | 9 +- tests/test_html_rewriting.py | 146 ++++++++++++++++++ 6 files changed, 211 insertions(+), 4 deletions(-) create mode 100644 test-website/content/http-equiv-redirect.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 2da34bd..48a2582 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Properly rewrite redirect target url when present in HTML tag (#237) + ### Changed - Generate fuzzy rules tests in Python and Javascript (#284) diff --git a/src/warc2zim/content_rewriting/html.py b/src/warc2zim/content_rewriting/html.py index 4b852fb..78312aa 100644 --- a/src/warc2zim/content_rewriting/html.py +++ b/src/warc2zim/content_rewriting/html.py @@ -1,4 +1,5 @@ import io +import re from collections import namedtuple from collections.abc import Callable from dataclasses import dataclass @@ -19,6 +20,10 @@ RewritenHtml = namedtuple("RewritenHmtl", ["title", "content"]) +HTTP_EQUIV_REDIRECT_RE = re.compile( + r"^\s*(?P.*?)\s*;\s*url\s*=\s*(?P.*?)\s*$" +) + def get_attr_value_from( attrs: AttrsList, name: str, default: str | None = None @@ -626,3 +631,30 @@ def rewrite_js_data( data, opts={"isModule": html_rewrite_context == "js-module"}, ) + + +@rules.rewrite_attribute() +def rewrite_meta_http_equiv_redirect( + tag: str, + attr_name: str, + attr_value: str | None, + attrs: AttrsList, + url_rewriter: ArticleUrlRewriter, + base_href: str | None, +) -> AttrNameAndValue | None: + """Rewrite redirect URL in meta http-equiv refresh""" + if tag != "meta": + return + if attr_name != "content": + return + if not attr_value: + return + http_equiv = get_attr_value_from(attrs, "http-equiv") + if http_equiv != "refresh": + return + if (match := HTTP_EQUIV_REDIRECT_RE.match(attr_value)) is None: + return + return ( + attr_name, + f"{match['interval']};url={url_rewriter(match['url'], base_href=base_href)}", + ) diff --git a/test-website/content/http-equiv-redirect.html b/test-website/content/http-equiv-redirect.html new file mode 100644 index 0000000..7889c01 --- /dev/null +++ b/test-website/content/http-equiv-redirect.html @@ -0,0 +1,23 @@ + + + + + + Test website + + + + + + + + + + +

Redirect with http-equiv meta directive

+ +

You should be redirected to home page in 3 seconds

+ + + + diff --git a/test-website/content/index.html b/test-website/content/index.html index 8197da2..17d8add 100644 --- a/test-website/content/index.html +++ b/test-website/content/index.html @@ -49,6 +49,7 @@
  • links to folder instead of file
  • Bad redirections
  • Handling of content types
  • +
  • Redirect with http-equiv meta directive
  • diff --git a/tests/conftest.py b/tests/conftest.py index d01c845..3e893bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,8 +21,9 @@ def no_js_notify_handler(_: str): class SimpleUrlRewriter(ArticleUrlRewriter): """Basic URL rewriter mocking most calls""" - def __init__(self, article_url: HttpUrl): + def __init__(self, article_url: HttpUrl, suffix: str = ""): self.article_url = article_url + self.suffix = suffix def __call__( self, @@ -31,7 +32,7 @@ def __call__( *, rewrite_all_url: bool = True, # noqa: ARG002 ) -> str: - return item_url + return item_url + self.suffix def get_item_path( self, item_url: str, base_href: str | None # noqa: ARG002 @@ -48,8 +49,8 @@ def get_document_uri( def simple_url_rewriter(): """Fixture to create a basic url rewriter returning URLs as-is""" - def get_simple_url_rewriter(url: str): - return SimpleUrlRewriter(HttpUrl(url)) + def get_simple_url_rewriter(url: str, suffix: str = ""): + return SimpleUrlRewriter(HttpUrl(url), suffix=suffix) yield get_simple_url_rewriter diff --git a/tests/test_html_rewriting.py b/tests/test_html_rewriting.py index 658e510..2616064 100644 --- a/tests/test_html_rewriting.py +++ b/tests/test_html_rewriting.py @@ -11,6 +11,7 @@ extract_base_href, format_attr, get_attr_value_from, + rewrite_meta_http_equiv_redirect, ) from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath @@ -903,6 +904,43 @@ def test_rewrite_meta_charset(rewrite_meta_charset_content, no_js_notify): ) +@pytest.fixture( + params=[ + ContentForTests( + '' + "whatever", + '' + "whatever", + ), + ] +) +def rewrite_meta_http_equiv_redirect_full_content(request): + yield request.param + + +def test_rewrite_meta_http_equiv_redirect_full( + rewrite_meta_http_equiv_redirect_full_content, no_js_notify +): + assert ( + HtmlRewriter( + ArticleUrlRewriter( + HttpUrl( + f"http://{rewrite_meta_http_equiv_redirect_full_content.article_url}" + ), + {ZimPath("kiwix.org/somepage")}, + ), + "", + "", + no_js_notify, + ) + .rewrite(rewrite_meta_http_equiv_redirect_full_content.input_str) + .content + == rewrite_meta_http_equiv_redirect_full_content.expected_str + ) + + rules = HTMLRewritingRules() @@ -1355,3 +1393,111 @@ def test_bad_html_data_rewrite_rules_argument_type(): @bad_rules.rewrite_data() def bad_signature(data: int) -> str | None: return f"{data}" + + +@pytest.mark.parametrize( + "tag, attr_name, attr_value, attrs, expected_result", + [ + pytest.param( + "meta", + "content", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + ("content", "1;url=http://www.example.com/somewhererewritten"), + id="nomimal_case", + ), + pytest.param( + "meta", + "content", + " 1 ; url = http://www.example.com/somewhere ", + [("http-equiv", "refresh")], + ("content", "1;url=http://www.example.com/somewhererewritten"), + id="nomimal_case_with_spaces", + ), + pytest.param( + "foo", + "content", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_foo_tag", + ), + pytest.param( + "meta", + "foo", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_foo_attribute", + ), + pytest.param( + "meta", + "content", + "1;url=http://www.example.com/somewhere", + [("http-equiv", "foo")], + None, + id="do_not_rewrite_http_equiv_not_refresh", + ), + pytest.param( + "meta", + "content", + "1;url=http://www.example.com/somewhere", + [], + None, + id="do_not_rewrite_no_http_equiv", + ), + pytest.param( + "meta", + "content", + None, + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_missing_attribute", + ), + pytest.param( + "meta", + "content", + "", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_empty_attribute", + ), + pytest.param( + "meta", + "content", + "1", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_attribute_without_url", + ), + pytest.param( + "meta", + "content", + "1;foo=http://www.example.com/somewhere", + [("http-equiv", "refresh")], + None, + id="do_not_rewrite_bad_attribute", + ), + ], +) +def test_rewrite_meta_http_equiv_redirect_rule( + tag: str, + attr_name: str, + attr_value: str | None, + attrs: AttrsList, + expected_result: AttrNameAndValue | None, + simple_url_rewriter, +): + url_rewriter = simple_url_rewriter("http://www.example.com", suffix="rewritten") + + assert ( + rewrite_meta_http_equiv_redirect( + tag=tag, + attr_name=attr_name, + attr_value=attr_value, + attrs=attrs, + url_rewriter=url_rewriter, + base_href=None, + ) + == expected_result + ) From 6ed3756638b08aac5465a56e59822a05bff4e4ab Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 30 Jul 2024 19:09:51 +0000 Subject: [PATCH 2/3] Fix CHANGELOG entry added to wrong section --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48a2582..40f99e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- New fuzzy-rule for cheatography.com (#342), der-postillon.com (#330) - Properly rewrite redirect target url when present in HTML tag (#237) ### Changed @@ -34,7 +35,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Exit with cleaner message when no entries are expected in the ZIM (#336) and when main entry is not processable (#337) - Add debug log for items whose content is empty (#344) -- New fuzzy-rule for cheatography.com (#342), der-postillon.com (#330) ### Fixed From 1f95bc2dbbb87d06e278e07b5f8a2e346ea478b7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 30 Jul 2024 19:12:47 +0000 Subject: [PATCH 3/3] Add missing CHANGELOG entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40f99e1..8bd7dac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Generate fuzzy rules tests in Python and Javascript (#284) +- Refactor HTML rewriter class to make it more open to change and expressive (#305) ### Fixed