Skip to content

Commit

Permalink
implement version-sorting algorithm for rust identifiers
Browse files Browse the repository at this point in the history
The algorithm is described in the [style guide] and was introduced in
`r-l/rust 115046`.

[style guide]: https://doc.rust-lang.org/nightly/style-guide/#sorting
  • Loading branch information
ytmimi committed Aug 16, 2024
1 parent fbe0424 commit 902d6fc
Show file tree
Hide file tree
Showing 2 changed files with 367 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ mod rewrite;
pub(crate) mod rustfmt_diff;
mod shape;
mod skip;
mod sort;
pub(crate) mod source_file;
pub(crate) mod source_map;
mod spanned;
Expand Down
366 changes: 366 additions & 0 deletions src/sort.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
use itertools::EitherOrBoth;
use itertools::Itertools;

/// Iterator which breaks an identifier into various [VersionChunk]s.
struct VersionChunkIter<'a> {
ident: &'a str,
start: usize,
}

impl<'a> VersionChunkIter<'a> {
pub(crate) fn new(ident: &'a str) -> Self {
Self { ident, start: 0 }
}

fn parse_numeric_chunk(
&mut self,
mut chars: std::str::CharIndices<'a>,
) -> Option<VersionChunk<'a>> {
let mut end = self.start;
let mut is_end_of_chunk = false;

while let Some((idx, c)) = chars.next() {
end = self.start + idx;

if c.is_ascii_digit() {
continue;
}

is_end_of_chunk = true;
break;
}

let source = if is_end_of_chunk {
let value = &self.ident[self.start..end];
self.start = end;
value
} else {
let value = &self.ident[self.start..];
self.start = self.ident.len();
value
};

let zeros = source.chars().take_while(|c| *c == '0').count();
let value = source.parse::<usize>().ok()?;

Some(VersionChunk::Number {
value,
zeros,
source,
})
}

fn parse_str_chunk(
&mut self,
mut chars: std::str::CharIndices<'a>,
) -> Option<VersionChunk<'a>> {
let mut end = self.start;
let mut is_end_of_chunk = false;

while let Some((idx, c)) = chars.next() {
end = self.start + idx;

if c == '_' {
is_end_of_chunk = true;
break;
}

if !c.is_numeric() {
continue;
}

is_end_of_chunk = true;
break;
}

let source = if is_end_of_chunk {
let value = &self.ident[self.start..end];
self.start = end;
value
} else {
let value = &self.ident[self.start..];
self.start = self.ident.len();
value
};

Some(VersionChunk::Str(source))
}
}

impl<'a> Iterator for VersionChunkIter<'a> {
type Item = VersionChunk<'a>;

fn next(&mut self) -> Option<Self::Item> {
let mut chars = self.ident[self.start..].char_indices();
let (_, next) = chars.next()?;

if next == '_' {
self.start = self.start + next.len_utf8();
return Some(VersionChunk::Underscore);
}

if next.is_ascii_digit() {
return self.parse_numeric_chunk(chars);
}

self.parse_str_chunk(chars)
}
}

/// Represents a chunk in the version-sort algorithm
#[derive(Debug, PartialEq, Eq)]
enum VersionChunk<'a> {
/// A singel `_` in an identifier. Underscores are sorted before all other characters.
Underscore,
/// A &str chunk in the version sort.
Str(&'a str),
/// A numeric chunk in the version sort. Keeps track of the numeric value and leading zeros.
Number {
value: usize,
zeros: usize,
source: &'a str,
},
}

/// Determine which side of the version-sort comparison had more leading zeros.
#[derive(Debug, PartialEq, Eq)]
enum MoreLeadingZeros {
Left,
Right,
Equal,
}

/// Compare two identifiers based on the version sorting algorithm described in [the style guide]
///
/// [the style guide]: https://doc.rust-lang.org/nightly/style-guide/#sorting
pub(crate) fn version_sort(a: &str, b: &str) -> std::cmp::Ordering {
let iter_a = VersionChunkIter::new(a);
let iter_b = VersionChunkIter::new(b);
let mut more_leading_zeros = MoreLeadingZeros::Equal;

for either_or_both in iter_a.zip_longest(iter_b) {
match either_or_both {
EitherOrBoth::Left(_) => return std::cmp::Ordering::Greater,
EitherOrBoth::Right(_) => return std::cmp::Ordering::Less,
EitherOrBoth::Both(a, b) => match (a, b) {
(VersionChunk::Underscore, VersionChunk::Underscore) => {
continue;
}
(VersionChunk::Underscore, _) => return std::cmp::Ordering::Less,
(_, VersionChunk::Underscore) => return std::cmp::Ordering::Greater,
(VersionChunk::Str(ca), VersionChunk::Str(cb))
| (VersionChunk::Str(ca), VersionChunk::Number { source: cb, .. })
| (VersionChunk::Number { source: ca, .. }, VersionChunk::Str(cb)) => {
match ca.cmp(&cb) {
std::cmp::Ordering::Equal => {
continue;
}
order @ _ => return order,
}
}
(
VersionChunk::Number {
value: va,
zeros: lza,
..
},
VersionChunk::Number {
value: vb,
zeros: lzb,
..
},
) => match va.cmp(&vb) {
std::cmp::Ordering::Equal => {
if lza == lzb {
continue;
}

if more_leading_zeros == MoreLeadingZeros::Equal && lza > lzb {
more_leading_zeros = MoreLeadingZeros::Left;
} else if more_leading_zeros == MoreLeadingZeros::Equal && lza < lzb {
more_leading_zeros = MoreLeadingZeros::Right;
}
continue;
}
order @ _ => return order,
},
},
}
}

match more_leading_zeros {
MoreLeadingZeros::Equal => std::cmp::Ordering::Equal,
MoreLeadingZeros::Left => std::cmp::Ordering::Less,
MoreLeadingZeros::Right => std::cmp::Ordering::Greater,
}
}

#[cfg(test)]
mod test {
use super::*;

#[test]
fn test_chunks() {
let mut iter = VersionChunkIter::new("x86_128");
assert_eq!(iter.next(), Some(VersionChunk::Str("x")));
assert_eq!(
iter.next(),
Some(VersionChunk::Number {
value: 86,
zeros: 0,
source: "86"
})
);
assert_eq!(iter.next(), Some(VersionChunk::Underscore));
assert_eq!(
iter.next(),
Some(VersionChunk::Number {
value: 128,
zeros: 0,
source: "128"
})
);
assert_eq!(iter.next(), None);

let mut iter = VersionChunkIter::new("w005s09t");
assert_eq!(iter.next(), Some(VersionChunk::Str("w")));
assert_eq!(
iter.next(),
Some(VersionChunk::Number {
value: 5,
zeros: 2,
source: "005"
})
);
assert_eq!(iter.next(), Some(VersionChunk::Str("s")));
assert_eq!(
iter.next(),
Some(VersionChunk::Number {
value: 9,
zeros: 1,
source: "09"
})
);
assert_eq!(iter.next(), Some(VersionChunk::Str("t")));
assert_eq!(iter.next(), None);

let mut iter = VersionChunkIter::new("ZY_WX");
assert_eq!(iter.next(), Some(VersionChunk::Str("ZY")));
assert_eq!(iter.next(), Some(VersionChunk::Underscore));
assert_eq!(iter.next(), Some(VersionChunk::Str("WX")));

let mut iter = VersionChunkIter::new("_v1");
assert_eq!(iter.next(), Some(VersionChunk::Underscore));
assert_eq!(iter.next(), Some(VersionChunk::Str("v")));
assert_eq!(
iter.next(),
Some(VersionChunk::Number {
value: 1,
zeros: 0,
source: "1"
})
);

let mut iter = VersionChunkIter::new("_1v");
assert_eq!(iter.next(), Some(VersionChunk::Underscore));
assert_eq!(
iter.next(),
Some(VersionChunk::Number {
value: 1,
zeros: 0,
source: "1"
})
);
assert_eq!(iter.next(), Some(VersionChunk::Str("v")));

let mut iter = VersionChunkIter::new("v009");
assert_eq!(iter.next(), Some(VersionChunk::Str("v")));
assert_eq!(
iter.next(),
Some(VersionChunk::Number {
value: 9,
zeros: 2,
source: "009"
})
);
}

#[test]
fn test_version_sort() {
let mut input = vec!["", "b", "a"];
let expected = vec!["", "a", "b"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["x7x", "xxx"];
let expected = vec!["x7x", "xxx"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["applesauce", "apple"];
let expected = vec!["apple", "applesauce"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["aaaaa", "aaa_a"];
let expected = vec!["aaa_a", "aaaaa"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["AAAAA", "AAA1A", "BBBBB", "BB_BB", "C3CCC"];
let expected = vec!["AAA1A", "AAAAA", "BB_BB", "BBBBB", "C3CCC"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["1_000_000", "1_010_001"];
let expected = vec!["1_000_000", "1_010_001"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec![
"5", "50", "500", "5_000", "5_005", "5_050", "5_500", "50_000", "50_005", "50_050",
"50_500",
];
let expected = vec![
"5", "5_000", "5_005", "5_050", "5_500", "50", "50_000", "50_005", "50_050", "50_500",
"500",
];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["X86_64", "x86_64", "X86_128", "x86_128"];
let expected = vec!["X86_64", "X86_128", "x86_64", "x86_128"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["__", "_"];
let expected = vec!["_", "__"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["foo_", "foo"];
let expected = vec!["foo", "foo_"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec!["A", "AA", "B", "a", "aA", "aa", "b"];
let expected = vec!["A", "AA", "B", "a", "aA", "aa", "b"];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected);

let mut input = vec![
"x87", "Z_YWX", "ZY_WX", "ZYW_X", "ZYWX", "ZYWX_", "v010", "v10", "w005s09t",
"w5s009t", "x64", "x86", "x86_32", "x86_64", "x86_128", "v000", "v00", "v0", "v0s",
"v00t", "v0u", "v001", "v01", "v1", "v009", "v09", "v9", "_ZYWX", "u_zzz", "u8", "u16",
"u32", "u64", "u128", "u256", "ua", "usize", "uz",
];
let expected = vec![
"_ZYWX", "Z_YWX", "ZY_WX", "ZYW_X", "ZYWX", "ZYWX_", "u_zzz", "u8", "u16", "u32",
"u64", "u128", "u256", "ua", "usize", "uz", "v000", "v00", "v0", "v0s", "v00t", "v0u",
"v001", "v01", "v1", "v009", "v09", "v9", "v010", "v10", "w005s09t", "w5s009t", "x64",
"x86", "x86_32", "x86_64", "x86_128", "x87",
];
input.sort_by(|a, b| version_sort(a, b));
assert_eq!(input, expected)
}
}

0 comments on commit 902d6fc

Please sign in to comment.