Skip to content

Commit 169c17c

Browse files
authored
Merge pull request #2271 from GitoxideLabs/improve-gix-url
`gix-url` improvements
2 parents de2c6fc + e09b3d6 commit 169c17c

File tree

10 files changed

+461
-150
lines changed

10 files changed

+461
-150
lines changed

gix-url/src/lib.rs

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,17 +113,39 @@ pub struct Url {
113113
/// The URL scheme.
114114
pub scheme: Scheme,
115115
/// The user to impersonate on the remote.
116+
///
117+
/// Stored in decoded form: percent-encoded characters are decoded during parsing.
118+
/// Re-encoded during canonical serialization, but written as-is in alternative form.
116119
pub user: Option<String>,
117120
/// The password associated with a user.
121+
///
122+
/// Stored in decoded form: percent-encoded characters are decoded during parsing.
123+
/// Re-encoded during canonical serialization. Cannot be serialized in alternative form (will panic in debug builds).
118124
pub password: Option<String>,
119125
/// The host to which to connect. Localhost is implied if `None`.
126+
///
127+
/// IPv6 addresses are stored *without* brackets for SSH schemes, but *with* brackets for other schemes.
128+
/// Brackets are automatically added during serialization when needed (e.g., when a port is specified with an IPv6 host).
120129
pub host: Option<String>,
121130
/// When serializing, use the alternative forms as it was parsed as such.
131+
///
132+
/// Alternative forms include SCP-like syntax (`user@host:path`) and bare file paths.
133+
/// When `true`, password and port cannot be serialized (will panic in debug builds).
122134
pub serialize_alternative_form: bool,
123135
/// The port to use when connecting to a host. If `None`, standard ports depending on `scheme` will be used.
124136
pub port: Option<u16>,
125137
/// The path portion of the URL, usually the location of the git repository.
126138
///
139+
/// Unlike `user` and `password`, paths are stored and serialized in their original form
140+
/// without percent-decoding or re-encoding (e.g., `%20` remains `%20`, not converted to space).
141+
///
142+
/// Path normalization during parsing:
143+
/// - SSH/Git schemes: Leading `/~` is stripped (e.g., `/~repo` becomes `~repo`)
144+
/// - SSH/Git schemes: Empty paths are rejected as errors
145+
/// - HTTP/HTTPS schemes: Empty paths are normalized to `/`
146+
///
147+
/// During serialization, SSH/Git URLs prepend `/` to paths not starting with `/`.
148+
///
127149
/// # Security Warning
128150
///
129151
/// URLs allow paths to start with `-` which makes it possible to mask command-line arguments as path which then leads to
@@ -380,6 +402,9 @@ impl Url {
380402

381403
out.write_all(self.scheme.as_str().as_bytes())?;
382404
out.write_all(b"://")?;
405+
406+
let needs_brackets = self.port.is_some() && self.host_needs_brackets();
407+
383408
match (&self.user, &self.host) {
384409
(Some(user), Some(host)) => {
385410
out.write_all(percent_encode(user).as_bytes())?;
@@ -388,10 +413,22 @@ impl Url {
388413
out.write_all(percent_encode(password).as_bytes())?;
389414
}
390415
out.write_all(b"@")?;
416+
if needs_brackets {
417+
out.write_all(b"[")?;
418+
}
391419
out.write_all(host.as_bytes())?;
420+
if needs_brackets {
421+
out.write_all(b"]")?;
422+
}
392423
}
393424
(None, Some(host)) => {
425+
if needs_brackets {
426+
out.write_all(b"[")?;
427+
}
394428
out.write_all(host.as_bytes())?;
429+
if needs_brackets {
430+
out.write_all(b"]")?;
431+
}
395432
}
396433
(None, None) => {}
397434
(Some(_user), None) => {
@@ -403,23 +440,45 @@ impl Url {
403440
if let Some(port) = &self.port {
404441
write!(out, ":{port}")?;
405442
}
443+
// For SSH and Git URLs, add leading '/' if path doesn't start with '/'
444+
// This handles paths like "~repo" which serialize as "/~repo" in URL form
445+
if matches!(self.scheme, Scheme::Ssh | Scheme::Git) && !self.path.starts_with(b"/") {
446+
out.write_all(b"/")?;
447+
}
406448
out.write_all(&self.path)?;
407449
Ok(())
408450
}
409451

452+
fn host_needs_brackets(&self) -> bool {
453+
fn is_ipv6(h: &str) -> bool {
454+
h.contains(':') && !h.starts_with('[')
455+
}
456+
self.host.as_ref().is_some_and(|h| is_ipv6(h))
457+
}
458+
410459
fn write_alternative_form_to(&self, out: &mut dyn std::io::Write) -> std::io::Result<()> {
460+
let needs_brackets = self.host_needs_brackets();
461+
411462
match (&self.user, &self.host) {
412463
(Some(user), Some(host)) => {
413464
out.write_all(user.as_bytes())?;
414-
assert!(
415-
self.password.is_none(),
416-
"BUG: cannot serialize password in alternative form"
417-
);
418465
out.write_all(b"@")?;
466+
if needs_brackets {
467+
out.write_all(b"[")?;
468+
}
419469
out.write_all(host.as_bytes())?;
470+
if needs_brackets {
471+
out.write_all(b"]")?;
472+
}
420473
}
421474
(None, Some(host)) => {
475+
if needs_brackets {
476+
out.write_all(b"[")?;
477+
}
422478
out.write_all(host.as_bytes())?;
479+
if needs_brackets {
480+
out.write_all(b"]")?;
481+
}
423482
}
424483
(None, None) => {}
425484
(Some(_user), None) => {
@@ -428,7 +487,6 @@ impl Url {
428487
));
429488
}
430489
}
431-
assert!(self.port.is_none(), "BUG: cannot serialize port in alternative form");
432490
if self.scheme == Scheme::Ssh {
433491
out.write_all(b":")?;
434492
}

gix-url/src/parse.rs

Lines changed: 82 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,23 @@ pub(crate) fn find_scheme(input: &BStr) -> InputScheme {
7070
return InputScheme::Url { protocol_end };
7171
}
7272

73-
if let Some(colon) = input.find_byte(b':') {
73+
// Find colon, but skip over IPv6 brackets if present
74+
let colon = if input.starts_with(b"[") {
75+
// IPv6 address, find the closing bracket first
76+
if let Some(bracket_end) = input.find_byte(b']') {
77+
// Look for colon after the bracket
78+
input[bracket_end + 1..]
79+
.find_byte(b':')
80+
.map(|pos| bracket_end + 1 + pos)
81+
} else {
82+
// No closing bracket, treat as regular search
83+
input.find_byte(b':')
84+
}
85+
} else {
86+
input.find_byte(b':')
87+
};
88+
89+
if let Some(colon) = colon {
7490
// allow user to select files containing a `:` by passing them as absolute or relative path
7591
// this is behavior explicitly mentioned by the scp and git manuals
7692
let explicitly_local = &input[..colon].contains(&b'/');
@@ -111,20 +127,54 @@ pub(crate) fn url(input: &BStr, protocol_end: usize) -> Result<crate::Url, Error
111127
// Normalize empty path to "/" for http/https URLs only
112128
let path = if url.path.is_empty() && matches!(scheme, Scheme::Http | Scheme::Https) {
113129
"/".into()
130+
} else if matches!(scheme, Scheme::Ssh | Scheme::Git) && url.path.starts_with("/~") {
131+
// For SSH and Git protocols, strip leading '/' from paths starting with '~'
132+
// e.g., "ssh://host/~repo" -> path is "~repo", not "/~repo"
133+
url.path[1..].into()
114134
} else {
115135
url.path.into()
116136
};
117137

138+
let user = url_user(&url, UrlKind::Url)?;
139+
let password = url
140+
.password
141+
.map(|s| percent_decoded_utf8(s, UrlKind::Url))
142+
.transpose()?;
143+
let port = url.port;
144+
145+
// For SSH URLs, strip brackets from IPv6 addresses
146+
let host = if scheme == Scheme::Ssh {
147+
url.host.map(|mut h| {
148+
// Bracketed IPv6 forms
149+
if let Some(h2) = h.strip_prefix('[') {
150+
if let Some(inner) = h2.strip_suffix("]:") {
151+
// "[::1]:" → "::1"
152+
h = inner.to_string();
153+
} else if let Some(inner) = h2.strip_suffix(']') {
154+
// "[::1]" → "::1"
155+
h = inner.to_string();
156+
}
157+
} else {
158+
// Non-bracketed host: strip a single trailing colon
159+
let colon_count = h.chars().filter(|&c| c == ':').take(2).count();
160+
if colon_count == 1 {
161+
if let Some(inner) = h.strip_suffix(':') {
162+
h = inner.to_string();
163+
}
164+
}
165+
}
166+
h
167+
})
168+
} else {
169+
url.host
170+
};
118171
Ok(crate::Url {
119172
serialize_alternative_form: false,
120173
scheme,
121-
user: url_user(&url, UrlKind::Url)?,
122-
password: url
123-
.password
124-
.map(|s| percent_decoded_utf8(s, UrlKind::Url))
125-
.transpose()?,
126-
host: url.host,
127-
port: url.port,
174+
user,
175+
password,
176+
host,
177+
port,
128178
path,
129179
})
130180
}
@@ -166,16 +216,33 @@ pub(crate) fn scp(input: &BStr, colon: usize) -> Result<crate::Url, Error> {
166216
source,
167217
})?;
168218

219+
// For SCP-like SSH URLs, strip leading '/' from paths starting with '/~'
220+
// e.g., "user@host:/~repo" -> path is "~repo", not "/~repo"
221+
let path = if path.starts_with("/~") { &path[1..] } else { path };
222+
223+
let user = url_user(&url, UrlKind::Scp)?;
224+
let password = url
225+
.password
226+
.map(|s| percent_decoded_utf8(s, UrlKind::Scp))
227+
.transpose()?;
228+
let port = url.port;
229+
230+
// For SCP-like SSH URLs, strip brackets from IPv6 addresses
231+
let host = url.host.map(|h| {
232+
if let Some(h) = h.strip_prefix("[").and_then(|h| h.strip_suffix("]")) {
233+
h.to_string()
234+
} else {
235+
h
236+
}
237+
});
238+
169239
Ok(crate::Url {
170240
serialize_alternative_form: true,
171241
scheme: Scheme::from(url.scheme.as_str()),
172-
user: url_user(&url, UrlKind::Scp)?,
173-
password: url
174-
.password
175-
.map(|s| percent_decoded_utf8(s, UrlKind::Scp))
176-
.transpose()?,
177-
host: url.host,
178-
port: url.port,
242+
user,
243+
password,
244+
host,
245+
port,
179246
path: path.into(),
180247
})
181248
}

gix-url/src/simple_url.rs

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,12 @@ impl<'a> ParsedUrl<'a> {
6464
};
6565

6666
// Parse authority: [user[:password]@]host[:port]
67-
let (username, password, host, port) = if let Some(at_pos) = authority.rfind('@') {
67+
let (username, password, host, port) = if let Some((user_info, host_port)) = authority.rsplit_once('@') {
6868
// Has user info
69-
let user_info = &authority[..at_pos];
70-
let host_port = &authority[at_pos + 1..];
71-
72-
let (user, pass) = if let Some(colon_pos) = user_info.find(':') {
73-
let pass_str = &user_info[colon_pos + 1..];
69+
let (user, pass) = if let Some((user, pass_str)) = user_info.split_once(':') {
7470
// Treat empty password as None
7571
let pass = if pass_str.is_empty() { None } else { Some(pass_str) };
76-
(&user_info[..colon_pos], pass)
72+
(user, pass)
7773
} else {
7874
(user_info, None)
7975
};
@@ -115,18 +111,25 @@ impl<'a> ParsedUrl<'a> {
115111
// Handle IPv6 addresses: [::1] or [::1]:port
116112
if host_port.starts_with('[') {
117113
if let Some(bracket_end) = host_port.find(']') {
118-
// IPv6 addresses are case-insensitive, normalize to lowercase
119-
let host = Some(host_port[..=bracket_end].to_ascii_lowercase());
120114
let remaining = &host_port[bracket_end + 1..];
121115

122116
if remaining.is_empty() {
117+
// IPv6 addresses are case-insensitive, normalize to lowercase
118+
let host = Some(host_port[..=bracket_end].to_ascii_lowercase());
123119
return Ok((host, None));
124120
} else if let Some(port_str) = remaining.strip_prefix(':') {
121+
if port_str.is_empty() {
122+
// Empty port like "[::1]:" - preserve the trailing colon for Git compatibility
123+
let host = Some(host_port.to_ascii_lowercase());
124+
return Ok((host, None));
125+
}
125126
let port = port_str.parse::<u16>().map_err(|_| UrlParseError::InvalidPort)?;
126127
// Validate port is in valid range (1-65535, port 0 is invalid)
127128
if port == 0 {
128129
return Err(UrlParseError::InvalidPort);
129130
}
131+
// IPv6 addresses are case-insensitive, normalize to lowercase
132+
let host = Some(host_port[..=bracket_end].to_ascii_lowercase());
130133
return Ok((host, Some(port)));
131134
} else {
132135
return Err(UrlParseError::InvalidDomainCharacter);
@@ -137,27 +140,38 @@ impl<'a> ParsedUrl<'a> {
137140
}
138141

139142
// Handle regular host:port
140-
// Use rfind to handle IPv6 addresses without brackets (edge case)
141-
if let Some(colon_pos) = host_port.rfind(':') {
143+
// Use rfind to find the last colon
144+
if let Some((before_last_colon, after_last_colon)) = host_port.rsplit_once(':') {
142145
// Check if this looks like a port (all digits after colon)
143-
let potential_port = &host_port[colon_pos + 1..];
144-
if potential_port.is_empty() {
145-
// Empty port like "host:" - strip the trailing colon
146-
let host_str = &host_port[..colon_pos];
147-
return Ok((Some(Self::normalize_hostname(host_str)?), None));
148-
} else if potential_port.chars().all(|c| c.is_ascii_digit()) {
149-
let host_str = &host_port[..colon_pos];
150-
let host = Self::normalize_hostname(host_str)?;
151-
let port = potential_port.parse::<u16>().map_err(|_| UrlParseError::InvalidPort)?;
152-
// Validate port is in valid range (1-65535, port 0 is invalid)
153-
if port == 0 {
154-
return Err(UrlParseError::InvalidPort);
146+
// But avoid treating IPv6 addresses as host:port
147+
// IPv6 addresses have colons in the part before the last colon (e.g., "::1" has "::" before the last ":")
148+
let has_colon_before_last = before_last_colon.contains(':');
149+
let is_all_digits_after =
150+
!after_last_colon.is_empty() && after_last_colon.chars().all(|c| c.is_ascii_digit());
151+
152+
// Treat as port separator only if:
153+
// 1. There's no colon before the last colon (normal host:port)
154+
// 2. OR it's explicitly empty (host: with trailing colon)
155+
if !has_colon_before_last {
156+
if after_last_colon.is_empty() {
157+
// Empty port like "host:" - store host with trailing colon
158+
// This is needed for Git compatibility where "host:" != "host"
159+
return Ok((Some(Self::normalize_hostname(host_port)?), None));
160+
} else if is_all_digits_after {
161+
let host = Self::normalize_hostname(before_last_colon)?;
162+
let port = after_last_colon
163+
.parse::<u16>()
164+
.map_err(|_| UrlParseError::InvalidPort)?;
165+
// Validate port is in valid range (1-65535, port 0 is invalid)
166+
if port == 0 {
167+
return Err(UrlParseError::InvalidPort);
168+
}
169+
return Ok((Some(host), Some(port)));
155170
}
156-
return Ok((Some(host), Some(port)));
157171
}
158172
}
159173

160-
// No port, just host
174+
// No port, just host (including bare IPv6 addresses)
161175
Ok((Some(Self::normalize_hostname(host_port)?), None))
162176
}
163177

0 commit comments

Comments
 (0)