Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Handling of protocol-relative external URLs in zimcheck #307

Merged
merged 3 commits into from
Jul 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions src/tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,8 +454,12 @@ void asciitolower(std::string& s)
UriKind html_link::detectUriKind(const std::string& input_string)
{
const auto k = input_string.find_first_of(":/?#");
if ( k == std::string::npos || input_string[k] != ':' )
return UriKind::OTHER;
if ( k == std::string::npos || input_string[k] != ':' ) {
if ( k == 0 && input_string.substr(0, 2) == "//" )
// [review thread on this line: marked as resolved by mgautierfr]
return UriKind::PROTOCOL_RELATIVE;
else
return UriKind::OTHER;
}

if ( k + 2 < input_string.size()
&& input_string[k+1] == '/'
Expand Down
1 change: 1 addition & 0 deletions src/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ enum class UriKind : int
URN, // urn:nbn:de:bsz:24-digibib-bsz3530416370

GENERIC_URI, // Generic URI with scheme and authority: <scheme>://.....
PROTOCOL_RELATIVE, // Protocol-relative URL: //<host>/<path>/<to>/<resource>

OTHER // not a valid URI (though it can be a valid relative
// or absolute URL)
Expand Down
Binary file modified test/data/zimfiles/bad_checksum.zim
Binary file not shown.
8 changes: 5 additions & 3 deletions test/data/zimfiles/create_test_zimfiles
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ make__good__zim()
--threads 1 \
--no-uuid \
-w main.html \
-f favicon.png \
-I favicon.png \
-l en \
-t "Test ZIM file" \
-d "N/A" \
Expand Down Expand Up @@ -68,7 +68,9 @@ make__poor__zim()
mv favicon.png image.png
sed -i -e 's!favicon.png!image.png!' main.html
sed -e 's!A/article1.html!!' main.html > empty_link.html
sed -e 's!I/image.png!http://a.io/pic.png!' main.html > external_link.html
sed -e 's!I/image\.png!http://a.io/pic.png!' main.html > external_image_http.html
sed -e 's!I/image\.png!https://a.io/pic.png!' main.html > external_image_https.html
sed -e 's!I/image\.png!//a.io/pic.png!' main.html > external_image_protocol_relative.html
sed -e 's/article1/non_existent/' main.html > dangling_link.html
sed -e 's!A/article1!../../oops!' main.html > outofbounds_link.html
command_set="sed -e '6 a <meta http-equiv=refresh content=\"0;URL=redirect_loop.html\">' main.html"
Expand All @@ -84,7 +86,7 @@ make__poor__zim()
--threads 1 \
--no-uuid \
-w "" \
-f "" \
-I "" \
-l en \
-t "" \
-d "" \
Expand Down
Binary file modified test/data/zimfiles/good.zim
Binary file not shown.
Binary file modified test/data/zimfiles/poor.zim
Binary file not shown.
17 changes: 17 additions & 0 deletions test/data/zimfiles/small_zimfile_data/main.html
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,27 @@
<h1>Main page</h1>

<img src="I/favicon.png">
<h2>Internal links</h2>
<ul>
<li>
<a href="A/article1.html">Article 1</a>
</li>
</ul>
<h2>External links</h2>
<ul>
<li>
<a href="http://openzim.org">HTTP link</a>
</li>
<li>
<a href="https://openzim.org">HTTPS link</a>
</li>
<li>
<a href="ftp://ftp.openzim.org">FTP link</a>
</li>
<li>
<a href="//openzim.org">Protocol relative link</a>
</li>
</ul>
<img src="">
</body>
</html>
3 changes: 3 additions & 0 deletions test/tools-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ TEST(tools, uriKind)
EXPECT_EQ(UriKind::GENERIC_URI, uriKind("file:///etc/passwd"));
EXPECT_EQ(UriKind::GENERIC_URI, uriKind("ftp://download.kiwix.org/zim/"));

EXPECT_EQ(UriKind::PROTOCOL_RELATIVE, uriKind("//example.com"));

EXPECT_EQ(UriKind::MAILTO, uriKind("mailto:someone@example.com"));
EXPECT_EQ(UriKind::MAILTO, uriKind("MAILTO:someone@example.com"));

Expand Down Expand Up @@ -212,6 +214,7 @@ TEST(tools, uriKind)
EXPECT_EQ(UriKind::OTHER, uriKind("showlocation.cgi?geo:12.34,56.78"));
EXPECT_EQ(UriKind::OTHER, uriKind("/xyz/javascript:console.log('hello, world!')"));

EXPECT_EQ(UriKind::OTHER, uriKind("/"));
EXPECT_EQ(UriKind::OTHER, uriKind("/api/data:text/plain;charset=UTF-8,qwerty"));
EXPECT_EQ(UriKind::OTHER, uriKind("../img/logo.png"));
EXPECT_EQ(UriKind::OTHER, uriKind("style.css"));
Expand Down
26 changes: 22 additions & 4 deletions test/zimcheck-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,9 @@ TEST(zimcheck, external_url_check_poorzimfile)
"[INFO] Zimcheck version is " VERSION "\n"
"[INFO] Verifying Articles' content..." "\n"
"[ERROR] Invalid external links found:" "\n"
" http://a.io/pic.png is an external dependence in article external_link.html" "\n"
" http://a.io/pic.png is an external dependence in article external_image_http.html" "\n"
" https://a.io/pic.png is an external dependence in article external_image_https.html" "\n"
" //a.io/pic.png is an external dependence in article external_image_protocol_relative.html" "\n"
"[INFO] Overall Test Status: Fail" "\n"
"[INFO] Total time taken by zimcheck: <3 seconds." "\n"
);
Expand Down Expand Up @@ -732,7 +734,9 @@ const std::string ALL_CHECKS_OUTPUT_ON_POORZIMFILE(
" Found 1 empty links in article: empty_link.html" "\n"
" ../../oops.html is out of bounds. Article: outofbounds_link.html" "\n"
"[ERROR] Invalid external links found:" "\n"
" http://a.io/pic.png is an external dependence in article external_link.html" "\n"
" http://a.io/pic.png is an external dependence in article external_image_http.html" "\n"
" https://a.io/pic.png is an external dependence in article external_image_https.html" "\n"
" //a.io/pic.png is an external dependence in article external_image_protocol_relative.html" "\n"
"[ERROR] Redirect loop(s) exist:" "\n"
" Redirect loop exists from entry redirect_loop.html" "\n"
"" "\n"
Expand Down Expand Up @@ -882,9 +886,23 @@ TEST(zimcheck, json_poorzimfile)
" {" "\n"
" \"check\" : \"url_external\"," "\n"
" \"level\" : \"ERROR\"," "\n"
" \"message\" : \"http://a.io/pic.png is an external dependence in article external_link.html\"," "\n"
" \"message\" : \"http://a.io/pic.png is an external dependence in article external_image_http.html\"," "\n"
" \"link\" : \"http://a.io/pic.png\"," "\n"
" \"path\" : \"external_link.html\"" "\n"
" \"path\" : \"external_image_http.html\"" "\n"
" }," "\n"
" {" "\n"
" \"check\" : \"url_external\"," "\n"
" \"level\" : \"ERROR\"," "\n"
" \"message\" : \"https://a.io/pic.png is an external dependence in article external_image_https.html\"," "\n"
" \"link\" : \"https://a.io/pic.png\"," "\n"
" \"path\" : \"external_image_https.html\"" "\n"
" }," "\n"
" {" "\n"
" \"check\" : \"url_external\"," "\n"
" \"level\" : \"ERROR\"," "\n"
" \"message\" : \"//a.io/pic.png is an external dependence in article external_image_protocol_relative.html\"," "\n"
" \"link\" : \"//a.io/pic.png\"," "\n"
" \"path\" : \"external_image_protocol_relative.html\"" "\n"
" }," "\n"
" {" "\n"
" \"check\" : \"redirect\"," "\n"
Expand Down