fangfufu · fangfufu · Sep 29, 2023 · Sep 3, 2023 · Sep 3, 2023 · Sep 3, 2023
diff --git a/src/link.c b/src/link.c
@@ -27,6 +27,7 @@ int ROOT_LINK_OFFSET = 0;
  * effectively gives LinkTable generation priority over file transfer.
  */
 static pthread_mutex_t link_lock;
+static void make_link_relative(const char *page_url, char *link_url);
 
 /**
  * \brief create a new Link
@@ -36,6 +37,7 @@ static Link *Link_new(const char *linkname, LinkType type)
  Link *link = CALLOC(1, sizeof(Link));
 
  strncpy(link->linkname, linkname, MAX_FILENAME_LEN);
+ strncpy(link->linkpath, linkname, MAX_FILENAME_LEN);
  link->type = type;
 
  /*
@@ -51,6 +53,7 @@ static Link *Link_new(const char *linkname, LinkType type)
 
 static CURL *Link_to_curl(Link *link)
 {
+ lprintf(debug, "%s\n", link->f_url);
  CURL *curl = curl_easy_init();
  if (!curl) {
  lprintf(fatal, "curl_easy_init() failed!\n");
@@ -184,6 +187,7 @@ static CURL *Link_to_curl(Link *link)
 
 static void Link_req_file_stat(Link *this_link)
 {
+ lprintf(debug, "%s\n", this_link->f_url);
  CURL *curl = Link_to_curl(this_link);
  CURLcode ret = curl_easy_setopt(curl, CURLOPT_NOBODY, 1);
  if (ret) {
@@ -269,26 +273,20 @@ static LinkTable *single_LinkTable_new(const char *url)
  return linktbl;
 }
 
-LinkTable *LinkSystem_init(const char *raw_url)
+LinkTable *LinkSystem_init(const char *url)
 {
  if (pthread_mutex_init(&link_lock, NULL)) {
  lprintf(error, "link_lock initialisation failed!\n");
  }
- /*
- * Remove excess '/' if it is there
- */
- char *url = strdup(raw_url);
  int url_len = strnlen(url, MAX_PATH_LEN) - 1;
- if (url[url_len] == '/') {
- url[url_len] = '\0';
- }
  /*
  * --------- Set the length of the root link -----------
  */
  /*
  * This is where the '/' should be
  */
- ROOT_LINK_OFFSET = strnlen(url, MAX_PATH_LEN);
+ ROOT_LINK_OFFSET = strnlen(url, MAX_PATH_LEN) -
+ ((url[url_len] == '/') ? 1 : 0);
 
  /*
  * --------------------- Enable cache system --------------------
@@ -319,7 +317,6 @@ LinkTable *LinkSystem_init(const char *raw_url)
  } else {
  lprintf(fatal, "Invalid CONFIG.mode\n");
  }
- FREE(url);
  return ROOT_LINK_TBL;
 }
 
@@ -388,7 +385,8 @@ static int linknames_equal(char *linkname, const char *linkname_new)
  * Shamelessly copied and pasted from:
  * https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
  */
-static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
+static void HTML_to_LinkTable(const char *url, GumboNode *node,
+ LinkTable *linktbl)
 {
  if (node->type != GUMBO_NODE_ELEMENT) {
  return;
@@ -397,31 +395,33 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
  if (node->v.element.tag == GUMBO_TAG_A &&
  (href =
  gumbo_get_attribute(&node->v.element.attributes, "href"))) {
+ char *link_url = href->value;
+ make_link_relative(url, link_url);
  /*
  * if it is valid, copy the link onto the heap
  */
- LinkType type = linkname_to_LinkType(href->value);
+ LinkType type = linkname_to_LinkType(link_url);
  /*
  * We also check if the link being added is the same as the last link.
  * This is to prevent duplicated link, if an Apache server has the
  * IconsAreLinks option.
  */
- size_t comp_len = strnlen(href->value, MAX_FILENAME_LEN);
+ size_t comp_len = strnlen(link_url, MAX_FILENAME_LEN);
  if (type == LINK_DIR) {
  comp_len--;
  }
  if (((type == LINK_DIR) || (type == LINK_UNINITIALISED_FILE)) &&
  !linknames_equal(linktbl->links[linktbl->num - 1]->linkname,
- href->value)) {
- LinkTable_add(linktbl, Link_new(href->value, type));
+ link_url)) {
+ LinkTable_add(linktbl, Link_new(link_url, type));
  }
  }
  /*
  * Note the recursive call, lol.
  */
  GumboVector *children = &node->v.element.children;
  for (size_t i = 0; i < children->length; ++i) {
- HTML_to_LinkTable((GumboNode *) children->data[i], linktbl);
+ HTML_to_LinkTable(url, (GumboNode *) children->data[i], linktbl);
  }
  return;
 }
@@ -465,22 +465,40 @@ void Link_set_file_stat(Link *this_link, CURL *curl)
 
+/**
+ * \brief Set f_url and the unescaped linkname of every non-head link.
+ * \note The results of curl_easy_init/unescape/escape are used unchecked.
+ */
 static void LinkTable_fill(LinkTable *linktbl)
 {
+ CURL *c = curl_easy_init();
  Link *head_link = linktbl->links[0];
+ lprintf(debug, "Filling %s\n", head_link->f_url);
  for (int i = 1; i < linktbl->num; i++) {
  Link *this_link = linktbl->links[i];
- char *url;
- url = path_append(head_link->f_url, this_link->linkname);
+ /* Sites may put raw characters (commonly spaces) in href values or may
+ percent-encode them properly. Unescape, then escape, so that curl gets
+ a valid URL (it rejects raw spaces) and already-encoded sequences
+ are not encoded twice, which would break the link. */
+ char *unescaped_path = curl_easy_unescape(c, this_link->linkpath, 0,
+ NULL);
+ char *escaped_path = curl_easy_escape(c, unescaped_path, 0);
+ curl_free(unescaped_path);
+ /* curl_easy_escape also encodes a trailing '/' as "%2F", which the rest
+ of the code does not handle, so turn it back into a real slash. */
+ int escaped_len = strlen(escaped_path);
+ if (escaped_len >= 3 && !strcmp(escaped_path + escaped_len - 3, "%2F"))
+ strcpy(escaped_path + escaped_len - 3, "/");
+ char *url = path_append(head_link->f_url, escaped_path);
+ curl_free(escaped_path);
  strncpy(this_link->f_url, url, MAX_PATH_LEN);
  FREE(url);
  char *unescaped_linkname;
- CURL *c = curl_easy_init();
  unescaped_linkname = curl_easy_unescape(c, this_link->linkname,
  0, NULL);
  strncpy(this_link->linkname, unescaped_linkname, MAX_FILENAME_LEN);
  curl_free(unescaped_linkname);
- curl_easy_cleanup(c);
  }
  LinkTable_uninitialised_fill(linktbl);
+ curl_easy_cleanup(c);
 }
 
 /**
@@ -574,7 +592,7 @@ LinkTable *LinkTable_new(const char *url)
  * Otherwise parsed the received data
  */
  GumboOutput *output = gumbo_parse(ts.data);
- HTML_to_LinkTable(output->root, linktbl);
+ HTML_to_LinkTable(url, output->root, linktbl);
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  FREE(ts.data);
 
@@ -1064,3 +1082,51 @@ long path_download(const char *path, char *output_buf, size_t req_size,
 
  return Link_download(link, output_buf, req_size, offset);
 }
+
+/**
+ * \brief Rewrite a path-absolute href so it is relative to its parent page.
+ * \details Some servers emit absolute links to subdirectories, but this code
+ * expects relative ones. A link_url that starts with the path part of
+ * page_url has that prefix removed in place; any other link is left as is.
+ * \param[in] page_url URL of the containing page
+ * \param[in,out] link_url writable, NUL-terminated href value
+ */
+static void make_link_relative(const char *page_url, char *link_url)
+{
+    if (link_url[0] != '/') {
+        /* Already relative, nothing to do here! */
+        return;
+    }
+
+    /* Find the slash after the host name, i.e. the third '/' in the URL. */
+    int slashes_left_to_find = 3;
+    while (*page_url) {
+        if (*page_url == '/' && !--slashes_left_to_find) {
+            break;
+        }
+        /* Advance here rather than in the loop condition, so that on exit
+           the pointer still points at the slash that was found. */
+        page_url++;
+    }
+    if (slashes_left_to_find) {
+        /* The scan hit the end of the string: we're at the top level of the
+           web site and the URL was entered without a trailing slash. */
+        page_url = "/";
+    }
+    /* page_url is now just the path part after the host name, and the link
+       URL has to start with it. */
+    size_t skip_len = strlen(page_url);
+    if (strncmp(link_url, page_url, skip_len) != 0) {
+        return;
+    }
+    if (page_url[skip_len - 1] != '/') {
+        /* The page path has no trailing slash, so the link must supply the
+           separator itself; this rejects e.g. "/directory" under "/dir". */
+        if (link_url[skip_len] != '/') {
+            return;
+        }
+        skip_len++;
+    }
+    /* Drop the parent page's path; the +1 also moves the terminating NUL. */
+    memmove(link_url, link_url + skip_len, strlen(link_url) - skip_len + 1);
+}
diff --git a/src/link.h b/src/link.h
@@ -43,6 +43,7 @@ struct LinkTable {
 struct Link {
  /** \brief The link name in the last level of the URL */
  char linkname[MAX_FILENAME_LEN + 1];
+ char linkpath[MAX_FILENAME_LEN + 1];
  /** \brief The full URL of the file */
  char f_url[MAX_PATH_LEN + 1];
  /** \brief The type of the link */

diff --git a/src/main.c b/src/main.c
@@ -225,6 +225,7 @@ parse_arg_list(int argc, char **argv, char ***fuse_argv, int *fuse_argc)
  return 1;
  case 'd':
  add_arg(fuse_argv, fuse_argc, "-d");
+ CONFIG.log_type |= debug;
  break;
  case 'f':
  add_arg(fuse_argv, fuse_argc, "-f");