Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decoding of HTML entities in links #383

Merged
merged 11 commits into from
Nov 15, 2023
66 changes: 57 additions & 9 deletions src/tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,54 @@
replaceStringInPlace(str, "\u202C", "");
}

namespace
{

// Looks up the replacement text for a named HTML character reference.
//
// `core` is the entity name without the leading '&' and the trailing ';'
// (e.g. "amp" for "&amp;").
//
// Returns a pointer to a string literal holding the decoded text, or nullptr
// if `core` is not one of the names in the table below. The lookup is an
// exact std::map::find(), so names are matched case-sensitively.
const char* getHtmlEntity(const std::string& core)
{
// Entity name -> decoded text. Function-local static, so the table is
// constructed only once.
static const std::map<std::string, const char*> t = {
{ "amp", "&" },
{ "apos", "'" },
{ "quot", "\"" },
{ "lt", "<" },
{ "gt", ">" },

Check warning on line 289 in src/tools.cpp

View check run for this annotation

Codecov / codecov/patch

src/tools.cpp#L285-L289

Added lines #L285 - L289 were not covered by tests
};

const auto it = t.find(core);
return it != t.end() ? it->second : nullptr;
}

} // unnamed namespace

// Returns a copy of `str` in which every "&name;" sequence whose name is
// known to getHtmlEntity() is replaced by the text that getHtmlEntity()
// returns for it.
//
// Everything else is copied through verbatim, in particular:
//  - "&name;" sequences with a name unknown to getHtmlEntity();
//  - an '&' that is followed by another '&' before any ';' (the text from
//    the first '&' up to the second one is kept as is and the second '&'
//    starts a new candidate);
//  - a trailing '&...' that is never terminated by ';'.
//
// The result of a replacement is never rescanned, so "&amp;lt;" yields
// "&lt;" rather than "<".
//
// The whole string is processed, including any embedded '\0' characters
// (scanning is bounded by str.size(), not by the first NUL).
std::string decodeHtmlEntities(const std::string& str)
{
  std::string result;
  // A replacement taken from the current entity table is never longer than
  // the reference it replaces, so the input size is a sufficient capacity.
  result.reserve(str.size());

  const char* const end = str.data() + str.size();

  // Points at the '&' of the entity candidate currently being scanned,
  // or is nullptr when we are outside of any candidate.
  const char* start = nullptr;

  for ( const char* p = str.data(); p != end; ++p ) {
    if ( *p == '&' ) {
      if ( start ) {
        // The previous candidate was not terminated by ';' - it is plain text
        result.append(start, p);
      }
      start = p;
    } else if ( !start ) {
      result.push_back(*p);
    } else if ( *p == ';' ) {
      // [start+1, p) is the entity name without '&' and ';'
      const char* d = getHtmlEntity(std::string(start+1, p));
      if ( d ) {
        result += d;
      } else {
        // Unknown entity: keep the original text, including '&' and ';'
        result.append(start, p+1);
      }
      start = nullptr;
    }
    // Otherwise we are inside a candidate; its characters are emitted later,
    // once we know whether it is a known entity.
  }

  if ( start ) {
    // Unterminated candidate at the end of the input
    result.append(start, end);
  }
  return result;
}

std::vector<html_link> generic_getLinks(const std::string& page)
{
const char* p = page.c_str();
Expand Down Expand Up @@ -310,7 +358,7 @@
while(*p != delimiter)
p++;
const std::string link(linkStart, p);
links.push_back(html_link(attr, link));
links.push_back(html_link(attr, decodeHtmlEntities(link)));
p += 1;
}
return links;
Expand Down Expand Up @@ -356,7 +404,6 @@
std::string output;
output.reserve(baseUrl.size() + input.size() + 1);

bool in_query = false;
bool check_rel = false;
const char* p = input.c_str();
if ( *(p) == '/') {
Expand All @@ -373,7 +420,7 @@
//URL Decoding.
while (*p)
{
if ( !in_query && check_rel ) {
if ( check_rel ) {
if (strncmp(p, "../", 3) == 0) {
// We must go "up"
// Remove the '/' at the end of output.
Expand All @@ -394,9 +441,13 @@
continue;
}
}
if ( *p == '#' || *p == '?')
// This is a beginning of the #anchor inside a page. No need to decode more

if ( *p == '#' || *p == '?') {
// For our purposes we can safely discard the query and/or fragment
// components of the URL
break;
}

if ( *p == '%')
{
char ch;
Expand All @@ -405,10 +456,7 @@
p += 3;
continue;
}
if ( *p == '?' ) {
// We are in the query, so don't try to interprete '/' as path separator
in_query = true;
}

if ( *p == '/') {
check_rel = true;
if (output.empty()) {
Expand Down
2 changes: 2 additions & 0 deletions src/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ bool isOutofBounds(const std::string& input, std::string base);
//Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final.
int adler32(const std::string& buf);

//Decodes the HTML character references &amp; &apos; &quot; &lt; and &gt; found in str.
//Any other text, including unsupported or unterminated references, is returned unchanged.
std::string decodeHtmlEntities(const std::string& str);

//Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it.
//Converts the %20 to space. Essential for comparing URLs.
std::string normalize_link(const std::string& input, const std::string& baseUrl);
Expand Down
152 changes: 136 additions & 16 deletions test/tools-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,9 @@

ASSERT_EQ(normalize_link("a", ""), "a");
ASSERT_EQ(normalize_link("./a", ""), "a");

// URI-decoding is performed
ASSERT_EQ(normalize_link("/%41%62c", "/"), "Abc");
}

TEST(tools, addler32)
Expand All @@ -257,32 +260,149 @@
ASSERT_EQ(adler32(""), 1);
}

// Table-driven check of decodeHtmlEntities(): every case is an input string
// paired with the exact output expected from the current implementation.
TEST(tools, decodeHtmlEntities)
{
  struct Case
  {
    const char* input;
    const char* expected;
  };

  const Case cases[] = {
    { "", "" },

    // Supported HTML character references
    { "&amp;",  "&"  },
    { "&apos;", "'"  },
    { "&quot;", "\"" },
    { "&lt;",   "<"  },
    { "&gt;",   ">"  },

    // All other HTML character references
    // (https://html.spec.whatwg.org/multipage/syntax.html#character-references)
    // are NOT currently supported
    { "&nbsp;", "&nbsp;" },

    // Capitalized versions of supported ones do NOT work
    { "&AMP;", "&AMP;" },
    { "&aMP;", "&aMP;" },

    // HTML entities of the form &#dd...; and/or &#xhh...; are NOT decoded
    { "&#65;",  "&#65;"  }, // should be "A"
    { "&#x41;", "&#x41;" }, // should be "A"

    // Handling of "incomplete" entity
    { "&amp", "&amp" },

    // No double decoding
    { "&amp;lt;", "&lt;" },

    // Entities next to each other and mixed with ordinary text
    { "&lt;&gt;", "<>" },
    { "1&lt;2", "1<2" },
    { "3&5&gt;3/5", "3&5>3/5" },
    { "Q&amp;A stands for &quot;Questions and answers&quot;",
      "Q&A stands for \"Questions and answers\"" },
  };

  for ( const auto& c : cases ) {
    ASSERT_EQ(decodeHtmlEntities(c.input), c.expected);
  }
}

// Renders a list of links as text, one "{ attribute, link }" entry per link.
// Entries are separated by a single '\n'; there is no trailing newline and
// an empty list yields an empty string.
std::string links2Str(const std::vector<html_link>& links)
{
  std::ostringstream out;
  bool isFirst = true;
  for ( const auto& entry : links ) {
    if ( !isFirst ) {
      out << "\n";
    }
    isFirst = false;
    out << "{ " << entry.attribute << ", " << entry.link << " }";
  }
  return out.str();
}

// Asserts that the links extracted from `html` by generic_getLinks(), once
// rendered with links2Str(), are equal to `expectedStr`.
// NOTE: despite the EXPECT_ prefix this expands to ASSERT_EQ.
#define EXPECT_LINKS(html, expectedStr) \
ASSERT_EQ(links2Str(generic_getLinks(html)), expectedStr)

TEST(tools, getLinks)
{
auto v = generic_getLinks("");
EXPECT_LINKS(
"",
""
);

EXPECT_LINKS(
R"(<link href="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
"{ href, https://fonts.io/css?family=OpenSans }"
);

ASSERT_TRUE(v.empty());
EXPECT_LINKS(
R"(<link href='https://fonts.io/css?family=OpenSans' rel="stylesheet">)",
"{ href, https://fonts.io/css?family=OpenSans }"
);

std::string page1 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
auto v1 = generic_getLinks(page1);
EXPECT_LINKS(
R"(<link src="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
"{ src, https://fonts.io/css?family=OpenSans }"
);

ASSERT_TRUE(v1.size() == 1);
ASSERT_EQ(v1[0].attribute, "href");
ASSERT_EQ(v1[0].link, "https://fonts.goos.com/css?family=OpenSans");
// URI-decoding is NOT performed on extracted links
// (that's normalize_link()'s job)
EXPECT_LINKS(
"<audio controls src ='/music/It&apos;s%20only%20love.ogg'></audio>",
"{ src, /music/It's%20only%20love.ogg }"
);

std::string page2 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
auto v2 = generic_getLinks(page2);
EXPECT_LINKS(
R"(<a href="/R&amp;D">Research and development</a>
blablabla
<a href="../syntax/&lt;script&gt;">&lt;script&gt;</a>
...
<a href="/Presidents/Dwight_&quot;Ike&quot;_Eisenhower">#34</a>
<img src="https://example.com/getlogo?w=640&amp;h=480">
)",
"{ href, /R&D }" "\n"
"{ href, ../syntax/<script> }" "\n"
"{ href, /Presidents/Dwight_\"Ike\"_Eisenhower }" "\n"
"{ src, https://example.com/getlogo?w=640&h=480 }"
);

ASSERT_TRUE(v2.size() == 1);
ASSERT_EQ(v1[0].attribute, "href");
// Known issue - HTML is not parsed and therefore false links
// may be returned
EXPECT_LINKS(
R"(
<html>
<head>
<link src = "/css/stylesheet.css" rel="stylesheet">
<link rel="icon" href = '/favicon.ico'>
</head>
<body>
<img src="../img/welcome.png">
<!--
<a href="commented_out_link.htm"></a>
<img src="commented_out_image.png">
-->
<pre>
&lt;a href="not_a_link_in_example_code_block.htm"&gt;&lt;/a&gt;
&lt;img src="not_a_link_in_example_code_block.png"&gt;
</pre>
Powered by <a target="_blank" href="https://kiwix.org">Kiwix</a>.
</body>
</html>
)",
// links
"{ src, /css/stylesheet.css }" "\n"
"{ href, /favicon.ico }" "\n"
"{ src, ../img/welcome.png }" "\n"
"{ href, commented_out_link.htm }" "\n"
"{ src, commented_out_image.png }" "\n"
"{ href, not_a_link_in_example_code_block.htm }" "\n"
"{ src, not_a_link_in_example_code_block.png }" "\n"
"{ href, https://kiwix.org }"
);

std::string page3 = "<link src=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
auto v3 = generic_getLinks(page3);
// Despite HTML not being properly parsed, not every href or src followed
// by an equality sign (with optional whitespace in between) results in a
// link
EXPECT_LINKS(
"abcd href = qwerty src={123} xyz",
""

Check notice on line 401 in test/tools-test.cpp

View check run for this annotation

codefactor.io / CodeFactor

test/tools-test.cpp#L401

Redundant blank line at the end of a code block should be deleted. (whitespace/blank_line)
);

ASSERT_TRUE(v3.size() == 1);
ASSERT_EQ(v3[0].attribute, "src");
ASSERT_EQ(v3[0].link, "https://fonts.goos.com/css?family=OpenSans");
}
#undef EXPECT_LINKS

TEST(tools, httpRedirectHtml)
{
Expand Down
Loading