From fc857d6b688a4b68f84780e163f775381ddafdb1 Mon Sep 17 00:00:00 2001 From: Jonathan Kamens Date: Sun, 3 Sep 2023 13:56:02 -0400 Subject: [PATCH 1/5] Do the right thing with sites that require the final slash Some web sites will return 404 if you fetch a directory without the final slash. For example, https://archive.mozilla.org/pub/ works, https://archive.mozilla.org/pub does not. We need to do two things to accommodate this: * When processing the root URL of the filesystem, instead of stripping off the final slash, just set the offset to ignore it. * In the link structure, store the actual URL tail of the link separately from its name, final slash and all if there is one, and append that instead of the name when constructing the URL for curl. --- src/link.c | 16 +++++----------- src/link.h | 1 + 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/link.c b/src/link.c index 659f186..7b305d5 100644 --- a/src/link.c +++ b/src/link.c @@ -36,6 +36,7 @@ static Link *Link_new(const char *linkname, LinkType type) Link *link = CALLOC(1, sizeof(Link)); strncpy(link->linkname, linkname, MAX_FILENAME_LEN); + strncpy(link->linkpath, linkname, MAX_FILENAME_LEN); link->type = type; /* @@ -269,26 +270,20 @@ static LinkTable *single_LinkTable_new(const char *url) return linktbl; } -LinkTable *LinkSystem_init(const char *raw_url) +LinkTable *LinkSystem_init(const char *url) { if (pthread_mutex_init(&link_lock, NULL)) { lprintf(error, "link_lock initialisation failed!\n"); } - /* - * Remove excess '/' if it is there - */ - char *url = strdup(raw_url); int url_len = strnlen(url, MAX_PATH_LEN) - 1; - if (url[url_len] == '/') { - url[url_len] = '\0'; - } /* * --------- Set the length of the root link ----------- */ /* * This is where the '/' should be */ - ROOT_LINK_OFFSET = strnlen(url, MAX_PATH_LEN); + ROOT_LINK_OFFSET = strnlen(url, MAX_PATH_LEN) - + ((url[url_len] == '/') ? 
1 : 0); /* * --------------------- Enable cache system -------------------- @@ -319,7 +314,6 @@ LinkTable *LinkSystem_init(const char *raw_url) } else { lprintf(fatal, "Invalid CONFIG.mode\n"); } - FREE(url); return ROOT_LINK_TBL; } @@ -469,7 +463,7 @@ static void LinkTable_fill(LinkTable *linktbl) for (int i = 1; i < linktbl->num; i++) { Link *this_link = linktbl->links[i]; char *url; - url = path_append(head_link->f_url, this_link->linkname); + url = path_append(head_link->f_url, this_link->linkpath); strncpy(this_link->f_url, url, MAX_PATH_LEN); FREE(url); char *unescaped_linkname; diff --git a/src/link.h b/src/link.h index 91372d2..73c30dc 100644 --- a/src/link.h +++ b/src/link.h @@ -43,6 +43,7 @@ struct LinkTable { struct Link { /** \brief The link name in the last level of the URL */ char linkname[MAX_FILENAME_LEN + 1]; + char linkpath[MAX_FILENAME_LEN + 1]; /** \brief The full URL of the file */ char f_url[MAX_PATH_LEN + 1]; /** \brief The type of the link */ From c2a0283795dcd24e2304c94323386d0ed9dba5fc Mon Sep 17 00:00:00 2001 From: Jonathan Kamens Date: Sun, 3 Sep 2023 14:53:12 -0400 Subject: [PATCH 2/5] Do the right thing with sites that use absolute links On some sites, the link to each subfolder is an absolute link rather than a relative one. To accommodate this, convert the links from absolute to relative before storing them in the link table. --- src/link.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/src/link.c b/src/link.c index 7b305d5..1631550 100644 --- a/src/link.c +++ b/src/link.c @@ -27,6 +27,7 @@ int ROOT_LINK_OFFSET = 0; * effectively gives LinkTable generation priority over file transfer. 
*/ static pthread_mutex_t link_lock; +static void make_link_relative(const char *page_url, char *link_url); /** * \brief create a new Link @@ -382,7 +383,8 @@ static int linknames_equal(char *linkname, const char *linkname_new) * Shamelessly copied and pasted from: * https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc */ -static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl) +static void HTML_to_LinkTable(const char *url, GumboNode *node, + LinkTable *linktbl) { if (node->type != GUMBO_NODE_ELEMENT) { return; @@ -391,23 +393,25 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl) if (node->v.element.tag == GUMBO_TAG_A && (href = gumbo_get_attribute(&node->v.element.attributes, "href"))) { + char *link_url = href->value; + make_link_relative(url, link_url); /* * if it is valid, copy the link onto the heap */ - LinkType type = linkname_to_LinkType(href->value); + LinkType type = linkname_to_LinkType(link_url); /* * We also check if the link being added is the same as the last link. * This is to prevent duplicated link, if an Apache server has the * IconsAreLinks option. 
*/ - size_t comp_len = strnlen(href->value, MAX_FILENAME_LEN); + size_t comp_len = strnlen(link_url, MAX_FILENAME_LEN); if (type == LINK_DIR) { comp_len--; } if (((type == LINK_DIR) || (type == LINK_UNINITIALISED_FILE)) && !linknames_equal(linktbl->links[linktbl->num - 1]->linkname, - href->value)) { - LinkTable_add(linktbl, Link_new(href->value, type)); + link_url)) { + LinkTable_add(linktbl, Link_new(link_url, type)); } } /* @@ -415,7 +419,7 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl) */ GumboVector *children = &node->v.element.children; for (size_t i = 0; i < children->length; ++i) { - HTML_to_LinkTable((GumboNode *) children->data[i], linktbl); + HTML_to_LinkTable(url, (GumboNode *) children->data[i], linktbl); } return; } @@ -568,7 +572,7 @@ LinkTable *LinkTable_new(const char *url) * Otherwise parsed the received data */ GumboOutput *output = gumbo_parse(ts.data); - HTML_to_LinkTable(output->root, linktbl); + HTML_to_LinkTable(url, output->root, linktbl); gumbo_destroy_output(&kGumboDefaultOptions, output); FREE(ts.data); @@ -1058,3 +1062,51 @@ long path_download(const char *path, char *output_buf, size_t req_size, return Link_download(link, output_buf, req_size, offset); } + +static void make_link_relative(const char *page_url, char *link_url) +{ + /* + Some servers make the links to subdirectories absolute, but our code + expects them to be relative, so change the contents of link_url as + needed to accommodate that. + */ + if (link_url[0] != '/') { + /* Already relative, nothing to do here! */ + return; + } + + /* Find the slash after the host name. */ + int slashes_left_to_find = 3; + while (*page_url) { + if (*page_url == '/' && ! --slashes_left_to_find) + break; + /* N.B. This is here, rather than doing `while (*page_url++)`, because + when we're done we want the pointer to point at the final slash. */ + page_url++; + } + if (slashes_left_to_find) + if (! 
*page_url) + /* We're at the top level of the web site and the user entered the URL + without a trailing slash. */ + page_url = "/"; + else + /* Well, that's odd. Let's return rather than trying to dig ourselves + deeper into whatever hole we're in. */ + return; + /* The page URL is no longer the full page_url, it's just the part after + the host name. + /* The link URL should start with the page URL. */ + if (strstr(link_url, page_url) != link_url) + return; + int skip_len = strlen(page_url); + if (page_url[skip_len-1] != '/') { + if (page_url[skip_len] != '/') + /* Um, I'm not sure what to do here, so give up. */ + return; + skip_len++; + } + /* Move the part of the link URL after the parent page's pat to + the beginning of the link URL string, discarding what came + before it. */ + memmove(link_url, link_url + skip_len, strlen(link_url) - skip_len + 1); +} From 97b9273a70ebbb34751181f0f4d431223d7062f6 Mon Sep 17 00:00:00 2001 From: Jonathan Kamens Date: Sun, 3 Sep 2023 16:00:38 -0400 Subject: [PATCH 3/5] Enabling debugging on command line should enable debug logging I believe an appropriate expectation is that if the user enables debugging with a command-line flag, then that should also enable messages designated as debug messages in the code to be printed. 
--- src/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main.c b/src/main.c index d4827b0..b3848f5 100644 --- a/src/main.c +++ b/src/main.c @@ -225,6 +225,7 @@ parse_arg_list(int argc, char **argv, char ***fuse_argv, int *fuse_argc) return 1; case 'd': add_arg(fuse_argv, fuse_argc, "-d"); + CONFIG.log_type |= debug; break; case 'f': add_arg(fuse_argv, fuse_argc, "-f"); From 53d1c9c741067a44202c6860880e1a2b513fb7b7 Mon Sep 17 00:00:00 2001 From: Jonathan Kamens Date: Sun, 3 Sep 2023 16:02:33 -0400 Subject: [PATCH 4/5] Add a few more debug messages to help trace program execution --- src/link.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/link.c b/src/link.c index 1631550..2122570 100644 --- a/src/link.c +++ b/src/link.c @@ -53,6 +53,7 @@ static Link *Link_new(const char *linkname, LinkType type) static CURL *Link_to_curl(Link *link) { + lprintf(debug, "%s\n", link->f_url); CURL *curl = curl_easy_init(); if (!curl) { lprintf(fatal, "curl_easy_init() failed!\n"); @@ -186,6 +187,7 @@ static CURL *Link_to_curl(Link *link) static void Link_req_file_stat(Link *this_link) { + lprintf(debug, "%s\n", this_link->f_url); CURL *curl = Link_to_curl(this_link); CURLcode ret = curl_easy_setopt(curl, CURLOPT_NOBODY, 1); if (ret) { @@ -464,6 +466,7 @@ void Link_set_file_stat(Link *this_link, CURL *curl) static void LinkTable_fill(LinkTable *linktbl) { Link *head_link = linktbl->links[0]; + lprintf(debug, "Filling %s\n", head_link->f_url); for (int i = 1; i < linktbl->num; i++) { Link *this_link = linktbl->links[i]; char *url; From 5f61aac7799d57e3288e866fa46070b12ef7698e Mon Sep 17 00:00:00 2001 From: Jonathan Kamens Date: Sun, 3 Sep 2023 16:03:00 -0400 Subject: [PATCH 5/5] Handle sites that put unencoded characters in URLs that curl dislikes Some sites put unencoded characters in their href attributes that really should be encoded, most notably spaces. Curl won't accept a URL with a space in it, and perhaps other such characters as well. 
Address this by properly encoding characters in URLs before feeding them to Curl. --- src/link.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/link.c b/src/link.c index 2122570..d82fa87 100644 --- a/src/link.c +++ b/src/link.c @@ -465,23 +465,40 @@ void Link_set_file_stat(Link *this_link, CURL *curl) static void LinkTable_fill(LinkTable *linktbl) { + CURL *c = curl_easy_init(); Link *head_link = linktbl->links[0]; lprintf(debug, "Filling %s\n", head_link->f_url); for (int i = 1; i < linktbl->num; i++) { Link *this_link = linktbl->links[i]; - char *url; - url = path_append(head_link->f_url, this_link->linkpath); + /* Some web sites use characters in their href attributes that really + shouldn't be in their href attributes, most commonly spaces. And + some web sites _do_ properly encode their href attributes. So we + first unescape the link path, and then we escape it, so that curl + will definitely be happy with it (e.g., curl won't accept URLs with + spaces in them!). If we only escaped it, and there were already + encoded characters in it, then that would break the link. */ + char *unescaped_path = curl_easy_unescape(c, this_link->linkpath, 0, + NULL); + char *escaped_path = curl_easy_escape(c, unescaped_path, 0); + curl_free(unescaped_path); + /* Our code does the wrong thing if there's a trailing slash that's been + replaced with %2F, which curl_easy_escape does, God bless it, so if + it did that then let's put it back. 
*/ + int escaped_len = strlen(escaped_path); + if (escaped_len >= 3 && !strcmp(escaped_path + escaped_len - 3, "%2F")) + strcpy(escaped_path + escaped_len - 3, "/"); + char *url = path_append(head_link->f_url, escaped_path); + curl_free(escaped_path); strncpy(this_link->f_url, url, MAX_PATH_LEN); FREE(url); char *unescaped_linkname; - CURL *c = curl_easy_init(); unescaped_linkname = curl_easy_unescape(c, this_link->linkname, 0, NULL); strncpy(this_link->linkname, unescaped_linkname, MAX_FILENAME_LEN); curl_free(unescaped_linkname); - curl_easy_cleanup(c); } LinkTable_uninitialised_fill(linktbl); + curl_easy_cleanup(c); } /**