Skip to content

Commit

Permalink
Merge pull request #270 from openzim/onxxx_events
Browse files Browse the repository at this point in the history
Add support for onxxx HTML events
  • Loading branch information
benoit74 authored May 24, 2024
2 parents ab2f17d + 51a50a6 commit a98e6cf
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 8 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ Scenario which are known to work well:
- Redirections with `meta http-equiv` are not yet supported (see https://github.com/openzim/warc2zim/issues/237)
- Web workers are not yet supported (see https://github.com/openzim/warc2zim/issues/272)
- Service workers are not supported and will most probably never be
- Inline JS code inside an onxxx HTML event attribute (e.g. onclick, onmouseover, ...) is rewritten, so that, for instance, a redirection to another page triggered by these events works
- However, since URL rewriting is performed by dynamic JS rewriting, at this stage the scraper has no way to know what is inside the ZIM and what is external; all URLs are hence assumed to be internal, which might break some dynamic redirections to an online website

It is also important to note that warc2zim is inherently limited to what is present inside the WARC. A bad WARC can only produce a bad ZIM. Garbage in, garbage out.

Expand Down
15 changes: 9 additions & 6 deletions src/warc2zim/content_rewriting/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ def rewrite(self, content: str) -> RewritenHtml:
self.url_rewriter, base_href=self.base_href, rewrite_all_url=False
)
self.css_rewriter = CssRewriter(self.url_rewriter, self.base_href)
self.js_rewriter = JsRewriter(
url_rewriter=self.url_rewriter,
base_href=self.base_href,
extra_rules=get_ds_rules(self.url_rewriter.article_url.value),
notify_js_module=self.notify_js_module,
)

self.feed(content)
self.close()
Expand Down Expand Up @@ -149,12 +155,7 @@ def handle_data(self, data: str):
data = RxRewriter(rules).rewrite(data, {})
elif self.html_rewrite_context and self.html_rewrite_context.startswith("js-"):
if data.strip():
data = JsRewriter(
url_rewriter=self.url_rewriter,
base_href=self.base_href,
extra_rules=get_ds_rules(self.url_rewriter.article_url.value),
notify_js_module=self.notify_js_module,
).rewrite(
data = self.js_rewriter.rewrite(
data,
opts={"isModule": self.html_rewrite_context == "js-module"},
)
Expand Down Expand Up @@ -206,6 +207,8 @@ def process_attr(
return (attr_name, ", ".join(new_value_list))
if attr_name == "style":
return (attr_name, self.css_rewriter.rewrite_inline(attr_value))
if attr_name.startswith("on") and not attr_name.startswith("on-"):
return (attr_name, self.js_rewriter.rewrite(attr_value))
return (attr_name, attr_value)

def format_attr(self, name: str, value: str | None) -> str:
Expand Down
1 change: 1 addition & 0 deletions test-website/content/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
<li><a href="./j%C3%A0v%C3%A0scr%C3%AFpt.html">Javascript with special chars</a></li>
<li><a href="./http-return-codes.html">HTTP return codes</a></li>
<li><a href="./base-href.html">Base href</a></li>
<li><a href="./onxxx.html">onxxx HTML events</a></li>
</ul>
</body>

Expand Down
38 changes: 38 additions & 0 deletions test-website/content/onxxx.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<!DOCTYPE html>
<html lang="en">

<head>
<meta charset="utf-8">
<title>Test website</title>
<link rel="apple-touch-icon" sizes="180x180" href="./icons/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="./icons/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="./icons/favicon-16x16.png">
<link rel="manifest" href="./icons/site.webmanifest">
<link rel="shortcut icon" href="./icons/favicon.ico">
<style>
.status {
color: red;
}
.status.green {
color: green;
}
</style>
</head>

<body>

<h2>onxxx HTML events</h2>

<p>Clicking the images below should redirect your browser to another page</p>

<p>Relative (<a href="./index.html">this site home page</a>): <img src="images/image1.png" onclick="document.location.href='./index.html';"> </p>

<p>Absolute internal 1 (<a href="https://standard_netloc/index.html">this site home page</a>): <img src="images/image1.png" onclick="document.location.href='https://standard_netloc/index.html';"></p>

<p>Absolute internal 2 (<a href="https://not_standard_netloc_punny_encoded/index.html">another site home page</a>): <img src="images/image1.png" onclick="document.location.href='https://not_standard_netloc_punny_encoded/index.html';"></p>

<p>Absolute external (kiwix homepage): <img src="images/image1.png" onclick="document.location.href='https://www.kiwix.org';"></p>

</body>

</html>
80 changes: 78 additions & 2 deletions tests/test_html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,11 @@ def test_extract_title(no_js_notify):
</html>"""

assert (
# Nota: lambda below is a trick, we should assign an ArticleUrlRewriter
HtmlRewriter(
lambda _: "kiwix.org", # pyright: ignore[reportGeneralTypeIssues, reportArgumentType]
ArticleUrlRewriter(
HttpUrl("http://example.com"),
{ZimPath("exemple.com/a/long/path")},
),
"",
"",
no_js_notify,
Expand Down Expand Up @@ -667,3 +669,77 @@ def test_simple_rewrite(input_content, expected_output, no_js_notify):
.content
== expected_output
)


# Inputs declared without an explicit expected output: an empty handler, and
# attributes which only resemble (or are unrelated to) an onxxx event.
# NOTE(review): presumably ContentForTests then expects the input to come back
# unchanged -- confirm against its definition.
_ONXXX_INPUTS_WITHOUT_EXPECTATION = (
    """<img onclick="">""",
    """<img on-whatever="foo">""",
    """<img on="foo">""",
    """<img to="foo">""",
)

# A real inline handler performing a redirection ...
_ONCLICK_REDIRECT_INPUT = (
    """<img onclick="document.location.href='./index.html';">"""
)

# ... and the attribute value expected once its JS has been rewritten.
# NOTA: quotes and ampersand are escaped since we are inside HTML attr
_ONCLICK_REDIRECT_EXPECTED = (
    """<img onclick="var _____WB$wombat$assign$function_____ = """
    "function(name) {return (self._wb_wombat &amp;&amp; "
    "self._wb_wombat.local_init &amp;&amp; "
    "self._wb_wombat.local_init(name)) || self[name]; };\n"
    "if (!self.__WB_pmw) { self.__WB_pmw = function(obj) "
    "{ this.__WB_source = obj; return this; } }\n"
    "{\n"
    "let window = _____WB$wombat$assign$function_____(&quot;window&quot;);"
    "\n"
    "let globalThis = _____WB$wombat$assign$function_____"
    "(&quot;globalThis&quot;);\n"
    "let self = _____WB$wombat$assign$function_____(&quot;self&quot;);\n"
    "let document = "
    "_____WB$wombat$assign$function_____(&quot;document&quot;);\n"
    "let location = "
    "_____WB$wombat$assign$function_____(&quot;location&quot;);\n"
    "let top = _____WB$wombat$assign$function_____(&quot;top&quot;);\n"
    "let parent = "
    "_____WB$wombat$assign$function_____(&quot;parent&quot;);\n"
    "let frames = "
    "_____WB$wombat$assign$function_____(&quot;frames&quot;);\n"
    "let opener = "
    "_____WB$wombat$assign$function_____(&quot;opener&quot;);\n"
    "let arguments;\n\n"
    "document.location.href=&#x27;./index.html&#x27;;\n"
    """}">"""
)


@pytest.fixture(
    params=[
        *(ContentForTests(html) for html in _ONXXX_INPUTS_WITHOUT_EXPECTATION),
        ContentForTests(_ONCLICK_REDIRECT_INPUT, _ONCLICK_REDIRECT_EXPECTED),
    ]
)
def rewrite_onxxx_content(request):
    """Yield, one at a time, the test contents covering onxxx attributes."""
    yield request.param


def test_rewrite_onxxx_event(rewrite_onxxx_content, no_js_notify):
    """Check the HTML rewriter output for each onxxx attribute test content.

    The document held by ``rewrite_onxxx_content`` is rewritten as if it were
    served from the article URL of that content, and the resulting HTML is
    compared with the expected string of that content.
    """
    article_url = HttpUrl(f"http://{rewrite_onxxx_content.article_url}")
    # Paths handed to the URL rewriter as its second argument
    zim_paths = {
        ZimPath("kiwix.org/foo.html"),
        ZimPath("kiwix.org/foo.js"),
        ZimPath("kiwix.org/foo.css"),
        ZimPath("kiwix.org/foo.png"),
        ZimPath("kiwix.org/favicon.png"),
    }
    url_rewriter = ArticleUrlRewriter(article_url, zim_paths)

    html_rewriter = HtmlRewriter(url_rewriter, "", "", no_js_notify)
    rewritten = html_rewriter.rewrite(rewrite_onxxx_content.input_str)

    assert rewritten.content == rewrite_onxxx_content.expected_str

0 comments on commit a98e6cf

Please sign in to comment.