Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle sites that use absolute links and sites that require the final slash in the URL #121

Merged
merged 5 commits into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 87 additions & 21 deletions src/link.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ int ROOT_LINK_OFFSET = 0;
* effectively gives LinkTable generation priority over file transfer.
*/
static pthread_mutex_t link_lock;
static void make_link_relative(const char *page_url, char *link_url);

/**
* \brief create a new Link
Expand All @@ -36,6 +37,7 @@ static Link *Link_new(const char *linkname, LinkType type)
Link *link = CALLOC(1, sizeof(Link));

strncpy(link->linkname, linkname, MAX_FILENAME_LEN);
strncpy(link->linkpath, linkname, MAX_FILENAME_LEN);
link->type = type;

/*
Expand All @@ -51,6 +53,7 @@ static Link *Link_new(const char *linkname, LinkType type)

static CURL *Link_to_curl(Link *link)
{
lprintf(debug, "%s\n", link->f_url);
CURL *curl = curl_easy_init();
if (!curl) {
lprintf(fatal, "curl_easy_init() failed!\n");
Expand Down Expand Up @@ -184,6 +187,7 @@ static CURL *Link_to_curl(Link *link)

static void Link_req_file_stat(Link *this_link)
{
lprintf(debug, "%s\n", this_link->f_url);
CURL *curl = Link_to_curl(this_link);
CURLcode ret = curl_easy_setopt(curl, CURLOPT_NOBODY, 1);
if (ret) {
Expand Down Expand Up @@ -269,26 +273,20 @@ static LinkTable *single_LinkTable_new(const char *url)
return linktbl;
}

LinkTable *LinkSystem_init(const char *raw_url)
LinkTable *LinkSystem_init(const char *url)
{
if (pthread_mutex_init(&link_lock, NULL)) {
lprintf(error, "link_lock initialisation failed!\n");
}
/*
* Remove excess '/' if it is there
*/
char *url = strdup(raw_url);
int url_len = strnlen(url, MAX_PATH_LEN) - 1;
if (url[url_len] == '/') {
url[url_len] = '\0';
}
/*
* --------- Set the length of the root link -----------
*/
/*
* This is where the '/' should be
*/
ROOT_LINK_OFFSET = strnlen(url, MAX_PATH_LEN);
ROOT_LINK_OFFSET = strnlen(url, MAX_PATH_LEN) -
((url[url_len] == '/') ? 1 : 0);

/*
* --------------------- Enable cache system --------------------
Expand Down Expand Up @@ -319,7 +317,6 @@ LinkTable *LinkSystem_init(const char *raw_url)
} else {
lprintf(fatal, "Invalid CONFIG.mode\n");
}
FREE(url);
return ROOT_LINK_TBL;
}

Expand Down Expand Up @@ -388,7 +385,8 @@ static int linknames_equal(char *linkname, const char *linkname_new)
* Shamelessly copied and pasted from:
* https://github.com/google/gumbo-parser/blob/master/examples/find_links.cc
*/
static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
static void HTML_to_LinkTable(const char *url, GumboNode *node,
LinkTable *linktbl)
{
if (node->type != GUMBO_NODE_ELEMENT) {
return;
Expand All @@ -397,31 +395,33 @@ static void HTML_to_LinkTable(GumboNode *node, LinkTable *linktbl)
if (node->v.element.tag == GUMBO_TAG_A &&
(href =
gumbo_get_attribute(&node->v.element.attributes, "href"))) {
char *link_url = href->value;
make_link_relative(url, link_url);
/*
* if it is valid, copy the link onto the heap
*/
LinkType type = linkname_to_LinkType(href->value);
LinkType type = linkname_to_LinkType(link_url);
/*
* We also check if the link being added is the same as the last link.
* This is to prevent duplicated link, if an Apache server has the
* IconsAreLinks option.
*/
size_t comp_len = strnlen(href->value, MAX_FILENAME_LEN);
size_t comp_len = strnlen(link_url, MAX_FILENAME_LEN);
if (type == LINK_DIR) {
comp_len--;
}
if (((type == LINK_DIR) || (type == LINK_UNINITIALISED_FILE)) &&
!linknames_equal(linktbl->links[linktbl->num - 1]->linkname,
href->value)) {
LinkTable_add(linktbl, Link_new(href->value, type));
link_url)) {
LinkTable_add(linktbl, Link_new(link_url, type));
}
}
/*
* Note the recursive call, lol.
*/
GumboVector *children = &node->v.element.children;
for (size_t i = 0; i < children->length; ++i) {
HTML_to_LinkTable((GumboNode *) children->data[i], linktbl);
HTML_to_LinkTable(url, (GumboNode *) children->data[i], linktbl);
}
return;
}
Expand Down Expand Up @@ -465,22 +465,40 @@ void Link_set_file_stat(Link *this_link, CURL *curl)

/**
 * \brief fill in the full URL and the unescaped name of every link in a
 * LinkTable
 * \details links[0] is treated as the head link; its f_url is the base that
 * every other link's path is appended to. For each remaining link, f_url is
 * set to the base joined with a normalised (unescaped, then re-escaped) path,
 * and linkname is replaced by its URL-unescaped form. Once the loop is done,
 * LinkTable_uninitialised_fill() is called on the table.
 * \param[in,out] linktbl the table whose links are filled in
 * \note NOTE(review): this span appears to be rendered from a diff, so lines
 * from both sides of the change are interleaved (e.g. `url` is declared
 * twice, and a curl handle is created and cleaned up both outside and inside
 * the loop). Confirm against the real file before relying on the exact
 * statement sequence.
 */
static void LinkTable_fill(LinkTable *linktbl)
{
/* curl handle passed to the curl_easy_escape / curl_easy_unescape calls */
CURL *c = curl_easy_init();
Link *head_link = linktbl->links[0];
lprintf(debug, "Filling %s\n", head_link->f_url);
/* Start at 1: index 0 is the head link itself. */
for (int i = 1; i < linktbl->num; i++) {
Link *this_link = linktbl->links[i];
char *url;
url = path_append(head_link->f_url, this_link->linkname);
/* Some web sites use characters in their href attributes that really
shouldn't be in their href attributes, most commonly spaces. And
some web sites _do_ properly encode their href attributes. So we
first unescape the link path, and then we escape it, so that curl
will definitely be happy with it (e.g., curl won't accept URLs with
spaces in them!). If we only escaped it, and there were already
encoded characters in it, then that would break the link. */
char *unescaped_path = curl_easy_unescape(c, this_link->linkpath, 0,
NULL);
char *escaped_path = curl_easy_escape(c, unescaped_path, 0);
curl_free(unescaped_path);
/* Our code does the wrong thing if there's a trailing slash that's been
replaced with %2F, which curl_easy_escape does, God bless it, so if
it did that then let's put it back. */
int escaped_len = strlen(escaped_path);
if (escaped_len >= 3 && !strcmp(escaped_path + escaped_len - 3, "%2F"))
strcpy(escaped_path + escaped_len - 3, "/");
char *url = path_append(head_link->f_url, escaped_path);
curl_free(escaped_path);
/* Copy at most MAX_PATH_LEN bytes of the joined URL into the link. */
strncpy(this_link->f_url, url, MAX_PATH_LEN);
FREE(url);
/* Replace the stored link name with its URL-unescaped form. */
char *unescaped_linkname;
CURL *c = curl_easy_init();
unescaped_linkname = curl_easy_unescape(c, this_link->linkname,
0, NULL);
strncpy(this_link->linkname, unescaped_linkname, MAX_FILENAME_LEN);
curl_free(unescaped_linkname);
curl_easy_cleanup(c);
}
LinkTable_uninitialised_fill(linktbl);
curl_easy_cleanup(c);
}

/**
Expand Down Expand Up @@ -574,7 +592,7 @@ LinkTable *LinkTable_new(const char *url)
* Otherwise parsed the received data
*/
GumboOutput *output = gumbo_parse(ts.data);
HTML_to_LinkTable(output->root, linktbl);
HTML_to_LinkTable(url, output->root, linktbl);
gumbo_destroy_output(&kGumboDefaultOptions, output);
FREE(ts.data);

Expand Down Expand Up @@ -1064,3 +1082,51 @@ long path_download(const char *path, char *output_buf, size_t req_size,

return Link_download(link, output_buf, req_size, offset);
}

/**
 * \brief turn a host-absolute link into one relative to its parent page
 * \details Some servers emit links to subdirectories as absolute paths
 * ("/dir/sub/"), whereas the rest of the code expects them relative to the
 * page they were found on ("sub/"). If link_url starts with the path
 * component of page_url, that prefix (plus the separating '/') is stripped
 * in place. In every other case link_url is left untouched.
 * \param[in] page_url full URL of the page the link was found on, e.g.
 * "http://host/dir" or "http://host/dir/"
 * \param[in,out] link_url the link to rewrite; it only ever gets shorter, so
 * the caller's buffer is always large enough
 */
static void make_link_relative(const char *page_url, char *link_url)
{
    if (link_url[0] != '/') {
        /* Already relative, nothing to do here! */
        return;
    }

    /*
     * Find the slash after the host name, i.e. the third '/' in
     * "scheme://host/path". On exit page_url points at that slash, or at
     * the terminating NUL if the URL has fewer than three slashes.
     */
    int slashes_left_to_find = 3;
    while (*page_url) {
        if (*page_url == '/' && !--slashes_left_to_find) {
            break;
        }
        page_url++;
    }
    if (slashes_left_to_find) {
        /*
         * The loop can only finish without finding the slash by reaching the
         * end of the string: we are at the top level of the web site and the
         * URL was given without a trailing slash.
         */
        page_url = "/";
    }

    /*
     * From here on page_url is only the path part of the page URL (it always
     * starts with '/', so it is never empty). The link must start with it.
     */
    size_t skip_len = strlen(page_url);
    if (strncmp(link_url, page_url, skip_len) != 0) {
        return;
    }
    if (page_url[skip_len - 1] != '/') {
        /*
         * The page path has no trailing slash, so the link must continue
         * with one right after the prefix; otherwise it merely shares a name
         * prefix with the page (e.g. "/dir" vs "/dirfoo/x"). The link is
         * what has to be inspected here: page_url[skip_len] is always the
         * terminating NUL.
         */
        if (link_url[skip_len] != '/') {
            return;
        }
        skip_len++;
    }

    /*
     * Move the part of the link after the parent page's path (including the
     * terminating NUL) to the beginning of the string, discarding what came
     * before it.
     */
    memmove(link_url, link_url + skip_len, strlen(link_url + skip_len) + 1);
}
1 change: 1 addition & 0 deletions src/link.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ struct LinkTable {
struct Link {
/** \brief The link name in the last level of the URL */
char linkname[MAX_FILENAME_LEN + 1];
char linkpath[MAX_FILENAME_LEN + 1];
/** \brief The full URL of the file */
char f_url[MAX_PATH_LEN + 1];
/** \brief The type of the link */
Expand Down
1 change: 1 addition & 0 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ parse_arg_list(int argc, char **argv, char ***fuse_argv, int *fuse_argc)
return 1;
case 'd':
add_arg(fuse_argv, fuse_argc, "-d");
CONFIG.log_type |= debug;
break;
case 'f':
add_arg(fuse_argv, fuse_argc, "-f");
Expand Down