openzim · kelson42 · Nov 15, 2023 · Nov 14, 2023 · Nov 14, 2023 · Nov 14, 2023
diff --git a/src/tools.cpp b/src/tools.cpp
@@ -276,6 +276,54 @@
  replaceStringInPlace(str, "\u202C", "");
 }
 
+namespace
+{
+
+// Returns the replacement text for the named HTML entity `core` (the part
+// between '&' and ';'), or nullptr when the name is not recognised.
+const char* getHtmlEntity(const std::string& core)
+{
+  struct Entity { const char* name; const char* text; };
+  static const Entity known[] = {
+    {"amp", "&"}, {"apos", "'"}, {"quot", "\""}, {"lt", "<"}, {"gt", ">"},
+  };
+  for ( const Entity& e : known ) {
+    if ( core == e.name ) return e.text;
+  }
+  return nullptr;
+}
+
+} // unnamed namespace
+
+// Replaces the named HTML entities known to getHtmlEntity() with the
+// characters they stand for. Any other "&...;" sequence, as well as an
+// unterminated "&...", is copied to the result unchanged. The output of
+// one replacement is never scanned again, so "&amp;lt;" yields "&lt;".
+std::string decodeHtmlEntities(const std::string& str)
+{
+  std::string result;
+  result.reserve(str.size()); // decoding never makes the text longer
+  // Walk the whole buffer by length instead of stopping at the first NUL,
+  // so that embedded '\0' bytes do not silently truncate the result.
+  const char* const end = str.data() + str.size();
+  const char* start = nullptr; // '&' of the entity candidate being scanned
+  for ( const char* p = str.data(); p != end; ++p ) {
+    if ( *p == '&' ) {
+      if ( start ) result.append(start, p); // previous candidate was plain text
+      start = p;
+    } else if ( !start ) {
+      result.push_back(*p);
+    } else if ( *p == ';' ) {
+      const char* const d = getHtmlEntity(std::string(start+1, p));
+      if ( d ) result += d;
+      else result.append(start, p+1); // unknown entity: keep it verbatim
+      start = nullptr;
+    }
+  }
+  if ( start ) result.append(start, end); // trailing "&..." without ';'
+  return result;
+}
+
 std::vector<html_link> generic_getLinks(const std::string& page)
 {
  const char* p = page.c_str();
@@ -310,7 +358,7 @@
  while(*p != delimiter)
  p++;
  const std::string link(linkStart, p);
- links.push_back(html_link(attr, link));
+ links.push_back(html_link(attr, decodeHtmlEntities(link)));
  p += 1;
  }
  return links;
@@ -356,7 +404,6 @@
  std::string output;
  output.reserve(baseUrl.size() + input.size() + 1);
 
- bool in_query = false;
  bool check_rel = false;
  const char* p = input.c_str();
  if ( *(p) == '/') {
@@ -373,7 +420,7 @@
  //URL Decoding.
  while (*p)
  {
- if ( !in_query && check_rel ) {
+ if ( check_rel ) {
  if (strncmp(p, "../", 3) == 0) {
  // We must go "up"
  // Remove the '/' at the end of output.
@@ -394,9 +441,13 @@
  continue;
  }
  }
- if ( *p == '#' || *p == '?')
- // This is a beginning of the #anchor inside a page. No need to decode more
+
+ if ( *p == '#' || *p == '?') {
+ // For our purposes we can safely discard the query and/or fragment
+ // components of the URL
  break;
+ }
+
  if ( *p == '%')
  {
  char ch;
@@ -405,10 +456,7 @@
  p += 3;
  continue;
  }
- if ( *p == '?' ) {
- // We are in the query, so don't try to interprete '/' as path separator
- in_query = true;
- }
+
  if ( *p == '/') {
  check_rel = true;
  if (output.empty()) {

diff --git a/src/tools.h b/src/tools.h
@@ -203,6 +203,8 @@ bool isOutofBounds(const std::string& input, std::string base);
 //Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final.
 int adler32(const std::string& buf);
 
+std::string decodeHtmlEntities(const std::string& str); // Decodes only &amp; &apos; &quot; &lt; &gt; - any other text is returned as is.
+
 //Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it.
 //Converts the %20 to space.Essential for comparing URLs.
 std::string normalize_link(const std::string& input, const std::string& baseUrl);

diff --git a/test/tools-test.cpp b/test/tools-test.cpp
@@ -247,6 +247,9 @@
 
  ASSERT_EQ(normalize_link("a", ""), "a");
  ASSERT_EQ(normalize_link("./a", ""), "a");
+
+ // URI-decoding is performed
+ ASSERT_EQ(normalize_link("/%41%62c", "/"), "Abc");
 }
 
 TEST(tools, addler32)
@@ -257,32 +260,149 @@
  ASSERT_EQ(adler32(""), 1);
 }
 
+TEST(tools, decodeHtmlEntities)
+{
+  // Table of inputs and the exact text decodeHtmlEntities() must produce.
+  struct Case { const char* input; const char* expected; };
+  const Case cases[] = {
+    { "", "" },
+    // Supported HTML character references
+    { "&amp;",  "&"  },
+    { "&apos;", "'"  },
+    { "&quot;", "\"" },
+    { "&lt;",   "<"  },
+    { "&gt;",   ">"  },
+
+    // All other HTML character references
+    // (https://html.spec.whatwg.org/multipage/syntax.html#character-references)
+    // are NOT currently supported
+    { "&nbsp;", "&nbsp;" },
+
+    // Capitalized versions of supported ones do NOT work
+    { "&AMP;", "&AMP;" },
+    { "&aMP;", "&aMP;" },
+
+    // HTML entities of the form &#dd...; and/or &#xhh...; are NOT decoded
+    { "&#65;",  "&#65;"  }, // should be "A"
+    { "&#x41;", "&#x41;" }, // should be "A"
+
+    // Handling of "incomplete" entity
+    { "&amp", "&amp" },
+
+    // No double decoding
+    { "&amp;lt;", "&lt;" },
+    { "&lt;&gt;", "<>" },
+    { "1&lt;2", "1<2" },
+    { "3&5&gt;3/5", "3&5>3/5" },
+    { "Q&amp;A stands for &quot;Questions and answers&quot;",
+      "Q&A stands for \"Questions and answers\"" },
+  };
+  for ( const Case& c : cases ) {
+    ASSERT_EQ(decodeHtmlEntities(c.input), c.expected) << "input: " << c.input;
+  }
+}
+
+// Renders links one per line as "{ attribute, link }" for easy comparison.
+std::string links2Str(const std::vector<html_link>& links)
+{
+  std::ostringstream out;
+  for ( auto it = links.begin(); it != links.end(); ++it ) {
+    if ( it != links.begin() ) out << "\n";
+    out << "{ " << it->attribute << ", " << it->link << " }";
+  }
+  return out.str();
+}
+
+#define EXPECT_LINKS(html, expectedStr) \
+ ASSERT_EQ(links2Str(generic_getLinks(html)), expectedStr) // NOTE: expands to the fatal ASSERT_EQ despite the EXPECT_ name
+
 TEST(tools, getLinks)
 {
- auto v = generic_getLinks("");
+ EXPECT_LINKS( // empty input: no links
+ "",
+ ""
+ );
+
+ EXPECT_LINKS( // double-quoted href value
+ R"(<link href="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
+ "{ href, https://fonts.io/css?family=OpenSans }"
+ );
 
- ASSERT_TRUE(v.empty());
+ EXPECT_LINKS( // single-quoted href value
+ R"(<link href='https://fonts.io/css?family=OpenSans' rel="stylesheet">)",
+ "{ href, https://fonts.io/css?family=OpenSans }"
+ );
 
- std::string page1 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
- auto v1 = generic_getLinks(page1);
+ EXPECT_LINKS( // src attribute instead of href
+ R"(<link src="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
+ "{ src, https://fonts.io/css?family=OpenSans }"
+ );
 
- ASSERT_TRUE(v1.size() == 1);
- ASSERT_EQ(v1[0].attribute, "href");
- ASSERT_EQ(v1[0].link, "https://fonts.goos.com/css?family=OpenSans");
+ // URI-decoding is NOT performed on extracted links
+ // (that's normalize_link()'s job)
+ EXPECT_LINKS(
+ "<audio controls src ='/music/It&apos;s%20only%20love.ogg'></audio>",
+ "{ src, /music/It's%20only%20love.ogg }"
+ );
 
- std::string page2 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
- auto v2 = generic_getLinks(page2);
+ EXPECT_LINKS( // HTML entities inside attribute values are decoded
+ R"(<a href="/R&amp;D">Research and development</a>
+ blablabla
+ <a href="../syntax/&lt;script&gt;">&lt;script&gt;</a>
+ ...
+ <a href="/Presidents/Dwight_&quot;Ike&quot;_Eisenhower">#34</a>
+ <img src="https://example.com/getlogo?w=640&amp;h=480">
+ )",
+ "{ href, /R&D }" "\n"
+ "{ href, ../syntax/<script> }" "\n"
+ "{ href, /Presidents/Dwight_\"Ike\"_Eisenhower }" "\n"
+ "{ src, https://example.com/getlogo?w=640&h=480 }"
+ );
 
- ASSERT_TRUE(v2.size() == 1);
- ASSERT_EQ(v1[0].attribute, "href");
+ // Known issue - HTML is not parsed and therefore false links
+ // may be returned
+ EXPECT_LINKS(
+ R"(
+<html>
+ <head>
+ <link src = "/css/stylesheet.css" rel="stylesheet">
+ <link rel="icon" href = '/favicon.ico'>
+ </head>
+ <body>
+ <img src="../img/welcome.png">
+ <!--
+ <a href="commented_out_link.htm"></a>
+ <img src="commented_out_image.png">
+ -->
+ <pre>
+ &lt;a href="not_a_link_in_example_code_block.htm"&gt;&lt;/a&gt;
+ &lt;img src="not_a_link_in_example_code_block.png"&gt;
+ </pre>
+ Powered by <a target="_blank" href="https://kiwix.org">Kiwix</a>.
+ </body>
+</html>
+)",
+ // expected links, including the false positives from the HTML comment and the <pre> block
+ "{ src, /css/stylesheet.css }" "\n"
+ "{ href, /favicon.ico }" "\n"
+ "{ src, ../img/welcome.png }" "\n"
+ "{ href, commented_out_link.htm }" "\n"
+ "{ src, commented_out_image.png }" "\n"
+ "{ href, not_a_link_in_example_code_block.htm }" "\n"
+ "{ src, not_a_link_in_example_code_block.png }" "\n"
+ "{ href, https://kiwix.org }"
+ );
 
- std::string page3 = "<link src=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
- auto v3 = generic_getLinks(page3);
+ // Despite HTML not being properly parsed, not every href or src followed
+ // by an equality sign (with optional whitespace in between) results in a
+ // link
+ EXPECT_LINKS(
+ "abcd href = qwerty src={123} xyz",
+ ""
+ );
 
- ASSERT_TRUE(v3.size() == 1);
- ASSERT_EQ(v3[0].attribute, "src");
- ASSERT_EQ(v3[0].link, "https://fonts.goos.com/css?family=OpenSans");
 }
+#undef EXPECT_LINKS
 
 TEST(tools, httpRedirectHtml)
 {