Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

878 support raw urls #884

Merged
merged 6 commits into from
Apr 26, 2023
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
302 changes: 302 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -537,12 +537,314 @@ pub fn slugify_filename(url: &str, prefix: &str, suffix: &str) -> String {
filename
}

/// Parse `url` into a [`Url`] while keeping path traversal sequences in its path intact
///
/// It is primarily used to detect url paths that `url::Url::parse` will
/// silently transform, such as /path/../file.html -> /file.html
///
/// When the raw path contains none of the known traversal encodings, or when
/// the raw path cannot be recovered from `url`, the result of a plain
/// `Url::parse` is returned instead.
///
/// # Warning
///
/// In the instance of a url with encoded path traversal strings, such as
/// /path/%2e%2e/file.html, the underlying `url::Url::parse` will
/// further encode the %-signs and return /path/%252e%252e/file.html
///
/// # Errors
///
/// Returns an error when
/// - `url` cannot be parsed by `Url::parse`
/// - the parsed url has no authority (mailto:, tel:, etc...)
/// - the parsed url has an authority, but neither a host nor a port
/// - the host can't be set on the manually constructed url
pub fn parse_url_with_raw_path(url: &str) -> Result<Url> {
    log::trace!("enter: parse_url_with_raw_path({})", url);

    let parsed = Url::parse(url)?;

    if !parsed.has_authority() {
        // parsed correctly, but no authority, meaning mailto: or tel: or
        // some other url that we don't care about
        bail!("url to parse has no authority and is therefore invalid");
    }

    // we have a valid url, the next step is to check the path and see if it's
    // something that url::Url::parse would silently transform
    //
    // i.e. if the path is /path/../file.html, url::Url::parse will transform it
    // to /file.html, which is not what we want
    //
    // to get at the raw path, we look for the farthest right authority
    // component in the original string; per RFC 3986, the authority is
    //   - authority = [ userinfo "@" ] host [ ":" port ]
    //
    // so the farthest right authority component is either the port or the host
    //
    // i.e. in http://example.com:8080/path/file.html it's :8080, and in
    // http://example.com/path/file.html it's example.com
    let farthest_right_authority_part = if let Some(port) = parsed.port() {
        format!(":{}", port)
    } else if let Some(host) = parsed.host_str() {
        host.to_owned()
    } else {
        // NOTE(review): urls such as file:///etc/passwd appear to report an
        // authority without a host, so this is an error instead of a panic
        bail!("url to parse has an authority, but has neither a port nor a host");
    };

    // find where the authority ends in the original string. An exact match is
    // tried first; the fallback searches an ascii-lowercased copy, since the
    // parsed host is normally lowercased while the user's input may not be.
    // Ascii-lowercasing never changes byte offsets, so the index found in the
    // copy is valid for (and on a char boundary of) the original string.
    let needle = farthest_right_authority_part.as_str();

    let Some(authority_end) = url
        .find(needle)
        .or_else(|| url.to_ascii_lowercase().find(needle))
        .map(|start| start + needle.len())
    else {
        // the authority can't be located in the original string, so there's
        // no raw path to recover; fall back to the normally parsed url
        return Ok(parsed);
    };

    let after_authority = &url[authority_end..];

    // when there is a port, but it matches the default port for the scheme,
    // url::Url::parse will mark the port as None, giving us an
    // `after_authority` that looks something like this:
    //   - :80/path/file.html
    //
    // only a port sitting directly after the host is removed; a ":80" that
    // shows up later in the path/query/fragment is left alone
    let after_authority = match after_authority.strip_prefix(':') {
        Some(rest) => rest.trim_start_matches(|c: char| c.is_ascii_digit()),
        None => after_authority,
    };

    // snippets from rfc-3986:
    //
    //   foo://example.com:8042/over/there?name=ferret#nose
    //   \_/   \______________/\_________/ \_________/ \__/
    //    |           |            |            |        |
    // scheme     authority       path        query   fragment
    //
    // The path component is terminated
    // by the first question mark ("?") or number sign ("#") character, or
    // by the end of the URI.
    //
    // cutting at whichever of the two comes first keeps a fragment that
    // contains a '?' (i.e. /path#frag?x) from leaking into the path
    let path = after_authority
        .find(['?', '#'])
        .map_or(after_authority, |end| &after_authority[..end]);

    // at this point, we have the path, all by itself

    // each of the following is a string that we can expect url::Url::parse to
    // transform. The variety is to ensure we cover most common path traversal
    // encodings
    const TRANSFORMATION_DETECTORS: [&str; 8] = [
        // ascii
        "..",
        // single url encoded
        "%2e%2e",
        // double url encoded
        "%25%32%65%25%32%65",
        // utf-8 encoded
        "%c0%ae%c0%ae",
        "%e0%40%ae%e0%40%ae",
        "%c0ae%c0ae",
        // 16 bit shenanigans
        "%uff0e%uff0e",
        "%u002e%u002e",
    ];

    // lowercase once, instead of once per detector
    let lowercased_path = path.to_lowercase();

    let parsing_will_transform_path = TRANSFORMATION_DETECTORS
        .iter()
        .any(|detector| lowercased_path.contains(detector));

    if !parsing_will_transform_path {
        // there's no string in the path of the url that will trigger a transformation
        // so, we can return it as-is
        return Ok(parsed);
    }

    // if we reach this point, the path contains a string that will trigger a transformation
    // so we need to manually create a Url that doesn't have the transformation
    // and return that
    //
    // special thanks to github user @lavafroth for this workaround

    // on windows, the url path separators are swapped for backslashes before
    // the path is handed to the file/directory constructors
    #[cfg(target_os = "windows")]
    let filesystem_path = path.replace('/', "\\");
    #[cfg(not(target_os = "windows"))]
    let filesystem_path = path;

    // from_file_path silently strips trailing slashes, and
    // from_directory_path adds them, so we'll choose the appropriate
    // constructor based on the presence of a path's trailing slash
    let constructed = if path.ends_with('/') {
        Url::from_directory_path(filesystem_path)
    } else {
        Url::from_file_path(filesystem_path)
    };

    // according to from_file_path docs:
    //   from_file_path returns `Err` if the given path is not absolute or,
    //   on Windows, if the prefix is not a disk prefix (e.g. `C:`) or a UNC prefix (`\\`).
    //
    // the extracted text isn't guaranteed to be an absolute path (i.e. when the
    // host string also appears in the userinfo and the split landed too early),
    // so a failure falls back to the normally parsed url instead of panicking
    let Ok(mut hacked_url) = constructed else {
        return Ok(parsed);
    };

    // host must be set first, otherwise multiple components may return Err
    hacked_url.set_host(parsed.host_str())?;

    // scheme/port/username/password can fail; NOTE(review): set_scheme looks
    // like it rejects switching from file: to a non-special scheme (foo://),
    // so failures fall back to the normally parsed url instead of panicking.
    // The order is kept as scheme -> port -> username -> password.
    let components_restored = hacked_url.set_scheme(parsed.scheme()).is_ok()
        && hacked_url.set_port(parsed.port()).is_ok()
        && hacked_url.set_username(parsed.username()).is_ok()
        && hacked_url.set_password(parsed.password()).is_ok();

    if !components_restored {
        return Ok(parsed);
    }

    // query/fragment can't fail
    hacked_url.set_query(parsed.query());
    hacked_url.set_fragment(parsed.fragment());

    log::trace!("exit: parse_url_with_raw_path -> {}", hacked_url);
    Ok(hacked_url)
}

#[cfg(test)]
mod tests {
use super::*;
use crate::config::Configuration;
use crate::scan_manager::{FeroxScans, ScanOrder};

#[test]
/// table-driven checks for parse_url_with_raw_path: round-trips, rejected
/// inputs, port handling, and encoded traversal sequences
fn utils_parse_url_with_raw_path() {
    // urls that must come back byte-for-byte identical, in order:
    //   - ../.. is preserved
    //   - ../.. is preserved as well as the trailing slash
    //   - no trailing slash is preserved
    //   - trailing slash is preserved
    let round_trips = [
        "https://www.google.com/../../stuff",
        "https://www.google.com/../../stuff/",
        "https://www.google.com/stuff",
        "https://www.google.com/stuff/",
    ];

    for raw in round_trips {
        let result: Url = parse_url_with_raw_path(raw).unwrap();
        assert_eq!(result.as_str(), raw);
    }

    // inputs that must be rejected, in order:
    //   - mailto is an error
    //   - relative url is an error
    //   - absolute without host is an error
    let rejected = ["mailto:user@example.com", "../../stuff", "/../../stuff"];

    for raw in rejected {
        assert!(parse_url_with_raw_path(raw).is_err());
    }

    // a default port is dropped from the parsed result, and the host survives
    let default_ports = [
        "http://example.com:80/path/file.html",
        "https://example.com:443/path/file.html",
    ];

    for raw in default_ports {
        let result = parse_url_with_raw_path(raw).unwrap();
        assert!(result.port().is_none());
        assert_eq!(result.host().unwrap().to_string().as_str(), "example.com");
    }

    // a non-default port is kept, and the url round-trips
    let explicit_ports = [
        "http://example.com:8080/path/file.html",
        "https://example.com:4433/path/file.html",
    ];

    for raw in explicit_ports {
        let result = parse_url_with_raw_path(raw).unwrap();
        assert!(result.port().is_some());
        assert_eq!(result.as_str(), raw);
    }

    // different encodings are respected if found in doubles
    //
    // note that the % sign is encoded as %25...
    let result = parse_url_with_raw_path("http://user:pass@example.com/%2e%2e/stuff.php").unwrap();
    assert_eq!(
        result.as_str(),
        "http://user:pass@example.com/%252e%252e/stuff.php"
    );

    // (input, expected serialization) pairs; credentials must survive in each
    let encoded_traversals = [
        (
            "http://user:pass@example.com/%25%32%65%25%32%65/stuff.php",
            "http://user:pass@example.com/%2525%2532%2565%2525%2532%2565/stuff.php",
        ),
        (
            "http://user:pass@example.com/%c0%ae%c0%ae/stuff.php",
            "http://user:pass@example.com/%25c0%25ae%25c0%25ae/stuff.php",
        ),
        (
            "http://user:pass@example.com/%e0%40%ae%e0%40%ae/stuff.php",
            "http://user:pass@example.com/%25e0%2540%25ae%25e0%2540%25ae/stuff.php",
        ),
        (
            "http://user:pass@example.com/%c0ae%c0ae/stuff.php",
            "http://user:pass@example.com/%25c0ae%25c0ae/stuff.php",
        ),
        (
            "http://user:pass@example.com/%uff0e%uff0e/stuff.php",
            "http://user:pass@example.com/%25uff0e%25uff0e/stuff.php",
        ),
        (
            "http://user:pass@example.com/%u002e%u002e/stuff.php",
            "http://user:pass@example.com/%25u002e%25u002e/stuff.php",
        ),
    ];

    for (raw, expected) in encoded_traversals {
        let result = parse_url_with_raw_path(raw).unwrap();
        assert_eq!(result.username(), "user");
        assert_eq!(result.password().unwrap(), "pass");
        assert_eq!(result.as_str(), expected);
    }
}

#[test]
/// set_open_file_limit with a low requested limit succeeds
fn utils_set_open_file_limit_with_low_requested_limit() {
Expand Down