Skip to content

Commit

Permalink
URL::parse fixes for empty paths (#7119)
Browse files Browse the repository at this point in the history
URL::parse could not handle URLs with empty paths, such as the
following:

http://www.example.com?name=something

Note the lack of '/' after the hostname.

This updates the parsing and printing logic to be able to handle this
while being careful not to break current parse expectations.

(cherry picked from commit 4e37533)
  • Loading branch information
bneradt authored and zwoop committed Sep 3, 2020
1 parent 53ddfaf commit 998d61e
Show file tree
Hide file tree
Showing 14 changed files with 690 additions and 62 deletions.
2 changes: 1 addition & 1 deletion doc/developer-guide/api/functions/TSUrlHostGet.en.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ and retrieve or modify parts of URLs, such as their host, port or scheme
information.

:func:`TSUrlSchemeGet`, :func:`TSUrlUserGet`, :func:`TSUrlPasswordGet`,
:func:`TSUrlHostGet`, :func:`TSUrlHttpParamsGet`, :func:`TSUrlHttpQueryGet`
:func:`TSUrlHostGet`, :func:`TSUrlPathGet`, :func:`TSUrlHttpParamsGet`, :func:`TSUrlHttpQueryGet`
and :func:`TSUrlHttpFragmentGet` each retrieve an internal pointer to the
specified portion of the URL from the marshall buffer :arg:`bufp`. The length
of the returned string is placed in :arg:`length` and a pointer to the URL
Expand Down
4 changes: 2 additions & 2 deletions include/ts/ts.h
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,8 @@ tsapi int TSUrlLengthGet(TSMBuffer bufp, TSMLoc offset);
string in the parameter length. This is the same length that
TSUrlLengthGet() returns. The returned string is allocated by a
call to TSmalloc(). It should be freed by a call to TSfree().
The length parameter must present, providing storage for the URL
string length value.
The length parameter must be present, providing storage for the
URL string length value.
Note: To get the effective URL from a request, use the alternative
TSHttpTxnEffectiveUrlStringGet or
TSHttpHdrEffectiveUrlBufGet APIs.
Expand Down
4 changes: 3 additions & 1 deletion plugins/header_rewrite/operators.cc
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,9 @@ OperatorSetRedirect::exec(const Resources &res) const
const char *end = value.size() + start;
if (remap) {
// Set new location.
TSUrlParse(bufp, url_loc, &start, end);
if (TS_PARSE_ERROR == TSUrlParse(bufp, url_loc, &start, end)) {
TSDebug(PLUGIN_NAME, "Could not set Location field value to: %s", value.c_str());
}
// Set the new status.
TSHttpTxnStatusSet(res.txnp, static_cast<TSHttpStatus>(_status.get_int_value()));
const_cast<Resources &>(res).changed_url = true;
Expand Down
108 changes: 83 additions & 25 deletions proxy/hdrs/URL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,11 @@ url_copy_onto_as_server_url(URLImpl *s_url, HdrHeap *s_heap, URLImpl *d_url, Hdr
{
url_nuke_proxy_stuff(d_url);

d_url->m_ptr_path = s_url->m_ptr_path;
d_url->m_ptr_params = s_url->m_ptr_params;
d_url->m_ptr_query = s_url->m_ptr_query;
d_url->m_ptr_fragment = s_url->m_ptr_fragment;
d_url->m_ptr_path = s_url->m_ptr_path;
d_url->m_path_is_empty = s_url->m_path_is_empty;
d_url->m_ptr_params = s_url->m_ptr_params;
d_url->m_ptr_query = s_url->m_ptr_query;
d_url->m_ptr_fragment = s_url->m_ptr_fragment;
url_clear_string_ref(d_url);

d_url->m_len_path = s_url->m_len_path;
Expand Down Expand Up @@ -827,9 +828,13 @@ url_length_get(URLImpl *url)
}

if (url->m_ptr_path) {
length += url->m_len_path + 1; // +1 for /
} else {
length += 1; // +1 for /
length += url->m_len_path;
}

if (!url->m_path_is_empty) {
// m_ptr_path does not contain the initial "/" and thus m_len_path does not
// count it. We account for it here.
length += 1; // +1 for "/"
}

if (url->m_ptr_params && url->m_len_params > 0) {
Expand Down Expand Up @@ -1185,26 +1190,30 @@ url_is_strictly_compliant(const char *start, const char *end)
using namespace UrlImpl;

ParseResult
url_parse(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings_p, bool strict_uri_parsing)
url_parse(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings_p, bool strict_uri_parsing,
bool verify_host_characters)
{
if (strict_uri_parsing && !url_is_strictly_compliant(*start, end)) {
return PARSE_RESULT_ERROR;
}

ParseResult zret = url_parse_scheme(heap, url, start, end, copy_strings_p);
return PARSE_RESULT_CONT == zret ? url_parse_http(heap, url, start, end, copy_strings_p) : zret;
return PARSE_RESULT_CONT == zret ? url_parse_http(heap, url, start, end, copy_strings_p, verify_host_characters) : zret;
}

ParseResult
url_parse_no_path_component_breakdown(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings_p)
url_parse_regex(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings_p)
{
ParseResult zret = url_parse_scheme(heap, url, start, end, copy_strings_p);
return PARSE_RESULT_CONT == zret ? url_parse_http_no_path_component_breakdown(heap, url, start, end, copy_strings_p) : zret;
return PARSE_RESULT_CONT == zret ? url_parse_http_regex(heap, url, start, end, copy_strings_p) : zret;
}

/**
Parse internet URL.
After this function completes, start will point to the first character after the
host or @a end if there are not characters after it.
@verbatim
[://][user[:password]@]host[:port]
Expand All @@ -1219,7 +1228,8 @@ url_parse_no_path_component_breakdown(HdrHeap *heap, URLImpl *url, const char **
*/

ParseResult
url_parse_internet(HdrHeap *heap, URLImpl *url, const char **start, char const *end, bool copy_strings_p)
url_parse_internet(HdrHeap *heap, URLImpl *url, const char **start, char const *end, bool copy_strings_p,
bool verify_host_characters)
{
const char *cur = *start;
const char *base; // Base for host/port field.
Expand Down Expand Up @@ -1296,8 +1306,14 @@ url_parse_internet(HdrHeap *heap, URLImpl *url, const char **start, char const *
bracket = cur; // location and flag.
++cur;
break;
case '/': // we're done with this phase.
end = cur; // cause loop exit
// RFC 3986, section 3.2:
// The authority component is ... terminated by the next slash ("/"),
// question mark ("?"), or number sign ("#") character, or by the end of
// the URI.
case '/':
case '?':
case '#':
end = cur; // We're done parsing authority, cause loop exit.
break;
default:
++cur;
Expand All @@ -1324,7 +1340,7 @@ url_parse_internet(HdrHeap *heap, URLImpl *url, const char **start, char const *
}
}
if (host._size) {
if (validate_host_name(std::string_view(host._ptr, host._size))) {
if (!verify_host_characters || validate_host_name(std::string_view(host._ptr, host._size))) {
url_host_set(heap, url, host._ptr, host._size, copy_strings_p);
} else {
return PARSE_RESULT_ERROR;
Expand All @@ -1339,9 +1355,6 @@ url_parse_internet(HdrHeap *heap, URLImpl *url, const char **start, char const *
}
url_port_set(heap, url, port._ptr, port._size, copy_strings_p);
}
if ('/' == *cur) {
++cur; // must do this after filling in host/port.
}
*start = cur;
return PARSE_RESULT_DONE;
}
Expand All @@ -1352,7 +1365,7 @@ url_parse_internet(HdrHeap *heap, URLImpl *url, const char **start, char const *
// empties params/query/fragment component

ParseResult
url_parse_http(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings)
url_parse_http(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings, bool verify_host_characters)
{
ParseResult err;
const char *cur;
Expand All @@ -1366,18 +1379,28 @@ url_parse_http(HdrHeap *heap, URLImpl *url, const char **start, const char *end,
const char *fragment_end = nullptr;
char mask;

err = url_parse_internet(heap, url, start, end, copy_strings);
err = url_parse_internet(heap, url, start, end, copy_strings, verify_host_characters);
if (err < 0) {
return err;
}

cur = *start;
if (url->m_ptr_host == nullptr && ((end - cur) >= 2) && '/' == *cur && '/' == *(cur + 1)) {
// RFC 3986 section-3.3:
// If a URI does not contain an authority component, then the path cannot
// begin with two slash characters ("//").
return PARSE_RESULT_ERROR;
}
bool nothing_after_host = false;
if (*start == end) {
nothing_after_host = true;
goto done;
}

path_start = cur;
mask = ';' & '?' & '#';
if (*cur == '/') {
path_start = cur;
}
mask = ';' & '?' & '#';
parse_path2:
if ((*cur & mask) == mask) {
if (*cur == ';') {
Expand Down Expand Up @@ -1431,10 +1454,42 @@ url_parse_http(HdrHeap *heap, URLImpl *url, const char **start, const char *end,

done:
if (path_start) {
// There was an explicit path set with '/'.
if (!path_end) {
path_end = cur;
}
if (path_start == path_end) {
url->m_path_is_empty = true;
} else {
url->m_path_is_empty = false;
// Per RFC 3986 section 3, the query string does not contain the initial
// '?' nor does the fragment contain the initial '#'. The path however
// does contain the initial '/' and a path can be empty, containing no
// characters at all, not even the initial '/'. Our path_get interface,
// however, has long not behaved accordingly, returning only the
// characters after the first '/'. This does not allow users to tell
// whether the path was absolutely empty. Further, callers have to
// account for the missing first '/' character themselves, either in URL
// length calculations or when piecing together their own URL. There are
// various examples of this in core and in the plugins shipped with Traffic
// Server.
//
// Correcting this behavior by having path_get return the entire path,
// (inclusive of any first '/') and an empty string if there were no
// characters specified in the path would break existing functionality,
// including various plugins that expect this behavior. Rather than
// correcting this behavior, therefore, we maintain the current
// functionality but add state to determine whether the path was
// absolutely empty so we can reconstruct such URLs.
++path_start;
}
url_path_set(heap, url, path_start, path_end - path_start, copy_strings);
} else if (!nothing_after_host) {
// There was no path set via '/': it is absolutely empty. However, if there
// is no path, query, or fragment after the host, we by convention add a
// slash after the authority. Users of URL expect this behavior. Thus the
// nothing_after_host check.
url->m_path_is_empty = true;
}
if (params_start) {
if (!params_end) {
Expand All @@ -1443,12 +1498,14 @@ url_parse_http(HdrHeap *heap, URLImpl *url, const char **start, const char *end,
url_params_set(heap, url, params_start, params_end - params_start, copy_strings);
}
if (query_start) {
// There was a query string marked by '?'.
if (!query_end) {
query_end = cur;
}
url_query_set(heap, url, query_start, query_end - query_start, copy_strings);
}
if (fragment_start) {
// There was a fragment string marked by '#'.
if (!fragment_end) {
fragment_end = cur;
}
Expand All @@ -1460,7 +1517,7 @@ url_parse_http(HdrHeap *heap, URLImpl *url, const char **start, const char *end,
}

ParseResult
url_parse_http_no_path_component_breakdown(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings)
url_parse_http_regex(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings)
{
const char *cur = *start;
const char *host_end;
Expand Down Expand Up @@ -1572,8 +1629,9 @@ url_print(URLImpl *url, char *buf_start, int buf_length, int *buf_index_inout, i
}
}

TRY(mime_mem_print("/", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));

if (!url->m_path_is_empty) {
TRY(mime_mem_print("/", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
}
if (url->m_ptr_path) {
TRY(mime_mem_print(url->m_ptr_path, url->m_len_path, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
}
Expand Down
Loading

0 comments on commit 998d61e

Please sign in to comment.