From 3f805c580bbad36943c2c17354ca66eeaf1ce326 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 14 Nov 2023 14:15:17 +0400 Subject: [PATCH 01/11] Slightly more meaningful tools.getLinks unit test --- test/tools-test.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/tools-test.cpp b/test/tools-test.cpp index 633dce64..b743b51b 100644 --- a/test/tools-test.cpp +++ b/test/tools-test.cpp @@ -270,11 +270,12 @@ TEST(tools, getLinks) ASSERT_EQ(v1[0].attribute, "href"); ASSERT_EQ(v1[0].link, "https://fonts.goos.com/css?family=OpenSans"); - std::string page2 = ""; + std::string page2 = ""; auto v2 = generic_getLinks(page2); ASSERT_TRUE(v2.size() == 1); - ASSERT_EQ(v1[0].attribute, "href"); + ASSERT_EQ(v2[0].attribute, "href"); + ASSERT_EQ(v2[0].link, "https://fonts.goos.com/css?family=OpenSans"); std::string page3 = ""; auto v3 = generic_getLinks(page3); From 0c0a5d14645c78972a96dc73f8ee0e8ea4c61385 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 14 Nov 2023 14:18:29 +0400 Subject: [PATCH 02/11] More readable tools.getLinks unit-test --- test/tools-test.cpp | 51 +++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/test/tools-test.cpp b/test/tools-test.cpp index b743b51b..388086b8 100644 --- a/test/tools-test.cpp +++ b/test/tools-test.cpp @@ -257,33 +257,44 @@ TEST(tools, addler32) ASSERT_EQ(adler32(""), 1); } -TEST(tools, getLinks) -{ - auto v = generic_getLinks(""); - - ASSERT_TRUE(v.empty()); - std::string page1 = ""; - auto v1 = generic_getLinks(page1); +std::string links2Str(const std::vector& links) +{ + std::ostringstream oss; + const char* sep = ""; + for ( const auto& l : links ) { + oss << sep << "{ " << l.attribute << ", " << l.link << " }"; + sep = "\n"; + } + return oss.str(); +} - ASSERT_TRUE(v1.size() == 1); - ASSERT_EQ(v1[0].attribute, "href"); - ASSERT_EQ(v1[0].link, "https://fonts.goos.com/css?family=OpenSans"); +#define EXPECT_LINKS(html, expectedStr) \ + 
ASSERT_EQ(links2Str(generic_getLinks(html)), expectedStr) - std::string page2 = ""; - auto v2 = generic_getLinks(page2); +TEST(tools, getLinks) +{ + EXPECT_LINKS( + "", + "" + ); - ASSERT_TRUE(v2.size() == 1); - ASSERT_EQ(v2[0].attribute, "href"); - ASSERT_EQ(v2[0].link, "https://fonts.goos.com/css?family=OpenSans"); + EXPECT_LINKS( + R"()", + "{ href, https://fonts.io/css?family=OpenSans }" + ); - std::string page3 = ""; - auto v3 = generic_getLinks(page3); + EXPECT_LINKS( + R"()", + "{ href, https://fonts.io/css?family=OpenSans }" + ); - ASSERT_TRUE(v3.size() == 1); - ASSERT_EQ(v3[0].attribute, "src"); - ASSERT_EQ(v3[0].link, "https://fonts.goos.com/css?family=OpenSans"); + EXPECT_LINKS( + R"()", + "{ src, https://fonts.io/css?family=OpenSans }" + ); } +#undef EXPECT_LINKS TEST(tools, httpRedirectHtml) { From df1d32b3d44b2ffd61d26c28adb674a41646804f Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 14 Nov 2023 14:45:55 +0400 Subject: [PATCH 03/11] Demonstrating shortcomings of generic_getLinks() generic_getLinks() doesn't decode HTML entities. Besides it doesn't parse HTML and therefore may extract false links. --- test/tools-test.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/test/tools-test.cpp b/test/tools-test.cpp index 388086b8..7fc0562d 100644 --- a/test/tools-test.cpp +++ b/test/tools-test.cpp @@ -293,6 +293,55 @@ TEST(tools, getLinks) R"()", "{ src, https://fonts.io/css?family=OpenSans }" ); + + // Known issue - HTML entities are not decoded + EXPECT_LINKS( + R"(Research and development + blablabla + <script> + ... + #34 + + )", + "{ href, /R&D }" "\n" + "{ href, ../syntax/<script> }" "\n" + "{ href, /Presidents/Dwight_"Ike"_Eisenhower }" "\n" + "{ src, https://example.com/getlogo?w=640&h=480 }" + ); + + // Known issue - HTML is not parsed and therefore false links + // may be returned + EXPECT_LINKS( + R"( + + + + + + + + +
+      <a href="not_a_link_in_example_code_block.htm"></a>
+      <img src="not_a_link_in_example_code_block.png">
+    
+ Powered by Kiwix. + + +)", + // links + "{ src, /css/stylesheet.css }" "\n" + "{ href, /favicon.ico }" "\n" + "{ src, ../img/welcome.png }" "\n" + "{ href, commented_out_link.htm }" "\n" + "{ src, commented_out_image.png }" "\n" + "{ href, not_a_link_in_example_code_block.htm }" "\n" + "{ src, not_a_link_in_example_code_block.png }" "\n" + "{ href, https://kiwix.org }" + ); } #undef EXPECT_LINKS From d4a0c13e63bb86d6361586e21713ffc8877cb964 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 14 Nov 2023 16:23:49 +0400 Subject: [PATCH 04/11] Enter decodeHtmlEntities() --- src/tools.cpp | 47 +++++++++++++++++++++++++++++++++++++++++++++ src/tools.h | 2 ++ test/tools-test.cpp | 38 ++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/src/tools.cpp b/src/tools.cpp index 04d28b75..f5c0ed03 100644 --- a/src/tools.cpp +++ b/src/tools.cpp @@ -276,6 +276,53 @@ void stripTitleInvalidChars(std::string& str) replaceStringInPlace(str, "\u202C", ""); } +namespace +{ + +const char* getHtmlEntity(const std::string& core) +{ + static const std::map t = { + { "amp", "&" }, + { "quot", "\"" }, + { "lt", "<" }, + { "gt", ">" }, + }; + + const auto it = t.find(core); + return it != t.end() ? 
it->second : nullptr; +} + +} // unnamed namespace + +std::string decodeHtmlEntities(const std::string& str) +{ + const char* p = str.c_str(); + std::string result; + const char* start = nullptr; + for ( ; *p ; ++p ) { + if ( *p == '&' ) { + if ( start ) { + result.insert(result.end(), start, p); + } + start = p; + } else if ( !start ) { + result.push_back(*p); + } else if ( *p == ';' ) { + const char* d = getHtmlEntity(std::string(start+1, p)); + if ( d ) { + result += d; + } else { + result.insert(result.end(), start, p+1); + } + start = nullptr; + } + } + if ( start ) { + result.insert(result.end(), start, p); + } + return result; +} + std::vector generic_getLinks(const std::string& page) { const char* p = page.c_str(); diff --git a/src/tools.h b/src/tools.h index c8315c23..513cb2e6 100644 --- a/src/tools.h +++ b/src/tools.h @@ -203,6 +203,8 @@ bool isOutofBounds(const std::string& input, std::string base); //Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final. int adler32(const std::string& buf); +std::string decodeHtmlEntities(const std::string& str); + //Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it. //Converts the %20 to space.Essential for comparing URLs. 
std::string normalize_link(const std::string& input, const std::string& baseUrl); diff --git a/test/tools-test.cpp b/test/tools-test.cpp index 7fc0562d..1921fe94 100644 --- a/test/tools-test.cpp +++ b/test/tools-test.cpp @@ -257,6 +257,44 @@ TEST(tools, addler32) ASSERT_EQ(adler32(""), 1); } +TEST(tools, decodeHtmlEntities) +{ + ASSERT_EQ(decodeHtmlEntities(""), ""); + + // Supported HTML character references + ASSERT_EQ(decodeHtmlEntities("&"), "&"); + ASSERT_EQ(decodeHtmlEntities("""), "\""); + ASSERT_EQ(decodeHtmlEntities("<"), "<"); + ASSERT_EQ(decodeHtmlEntities(">"), ">"); + + // All other HTML character references + // (https://html.spec.whatwg.org/multipage/syntax.html#character-references) + // are NOT currently supported + ASSERT_EQ(decodeHtmlEntities("'"), "'"); // should be "'" + + // Capitalized versions of supported ones do NOT work + ASSERT_EQ(decodeHtmlEntities("&"), "&"); + ASSERT_EQ(decodeHtmlEntities("&aMP;"), "&aMP;"); + + // HTML entities of the form &#dd...; and/or &#xhh...; are NOT decoded + ASSERT_EQ(decodeHtmlEntities("A"), "A" ); // should be "A" + ASSERT_EQ(decodeHtmlEntities("A"), "A"); // should be "A" + + // Handling of "incomplete" entity + ASSERT_EQ(decodeHtmlEntities("&"), "&"); + + // No double decoding + ASSERT_EQ(decodeHtmlEntities("&lt;"), "<"); + + ASSERT_EQ(decodeHtmlEntities("<>"), "<>"); + + ASSERT_EQ(decodeHtmlEntities("1<2"), "1<2"); + + ASSERT_EQ( + decodeHtmlEntities("Q&A stands for "Questions and answers""), + "Q&A stands for \"Questions and answers\"" + ); +} std::string links2Str(const std::vector& links) { From 8542b7d7d0ed016520746b1f493f5aeab010604e Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Tue, 14 Nov 2023 16:34:18 +0400 Subject: [PATCH 05/11] generic_getLinks() decodes HTML entities --- src/tools.cpp | 2 +- test/tools-test.cpp | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/tools.cpp b/src/tools.cpp index f5c0ed03..d7a66f5e 100644 --- a/src/tools.cpp +++ b/src/tools.cpp 
@@ -357,7 +357,7 @@ std::vector generic_getLinks(const std::string& page) while(*p != delimiter) p++; const std::string link(linkStart, p); - links.push_back(html_link(attr, link)); + links.push_back(html_link(attr, decodeHtmlEntities(link))); p += 1; } return links; diff --git a/test/tools-test.cpp b/test/tools-test.cpp index 1921fe94..e2cdbbc2 100644 --- a/test/tools-test.cpp +++ b/test/tools-test.cpp @@ -332,7 +332,6 @@ TEST(tools, getLinks) "{ src, https://fonts.io/css?family=OpenSans }" ); - // Known issue - HTML entities are not decoded EXPECT_LINKS( R"(Research and development blablabla @@ -341,10 +340,10 @@ TEST(tools, getLinks) #34 )", - "{ href, /R&D }" "\n" - "{ href, ../syntax/<script> }" "\n" - "{ href, /Presidents/Dwight_"Ike"_Eisenhower }" "\n" - "{ src, https://example.com/getlogo?w=640&h=480 }" + "{ href, /R&D }" "\n" + "{ href, ../syntax/