Skip to content

Commit fb16585

Browse files
authored
Merge pull request #7567 from MoSal/faster_sort_n
sort: immediately compare whole lines if they parse as numbers
2 parents e4fa68c + 410da77 commit fb16585

File tree

3 files changed

+47
-2
lines changed

3 files changed

+47
-2
lines changed

src/uu/sort/BENCHMARKING.md

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,19 @@ Run `cargo build --release` before benchmarking after you make a change!
2424

2525
## Sorting numbers
2626

27-
- Generate a list of numbers: `seq 0 100000 | sort -R > shuffled_numbers.txt`.
28-
- Benchmark numeric sorting with hyperfine: `hyperfine "target/release/coreutils sort shuffled_numbers.txt -n -o output.txt"`.
27+
- Generate a list of numbers:
28+
```
29+
shuf -i 1-1000000 -n 1000000 > shuffled_numbers.txt
30+
# or
31+
seq 1 1000000 | sort -R > shuffled_numbers.txt
32+
```
33+
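- Benchmark numeric sorting with hyperfine (the commands below assume the binaries being compared and `shuffled_numbers.txt` are all located in `/tmp`):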
34+
```
35+
hyperfine --warmup 3 \
36+
'/tmp/gnu-sort -n /tmp/shuffled_numbers.txt' \
37+
'/tmp/uu_before sort -n /tmp/shuffled_numbers.txt' \
38+
'/tmp/uu_after sort -n /tmp/shuffled_numbers.txt'
39+
```
2940

3041
## Sorting numbers with -g
3142

src/uu/sort/src/chunks.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ pub struct LineData<'a> {
4242
pub selections: Vec<&'a str>,
4343
pub num_infos: Vec<NumInfo>,
4444
pub parsed_floats: Vec<GeneralF64ParseResult>,
45+
pub line_num_floats: Vec<Option<f64>>,
4546
}
4647

4748
impl Chunk {
@@ -52,6 +53,7 @@ impl Chunk {
5253
contents.line_data.selections.clear();
5354
contents.line_data.num_infos.clear();
5455
contents.line_data.parsed_floats.clear();
56+
contents.line_data.line_num_floats.clear();
5557
let lines = unsafe {
5658
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
5759
// because the vector is empty.
@@ -73,13 +75,15 @@ impl Chunk {
7375
selections,
7476
std::mem::take(&mut contents.line_data.num_infos),
7577
std::mem::take(&mut contents.line_data.parsed_floats),
78+
std::mem::take(&mut contents.line_data.line_num_floats),
7679
)
7780
});
7881
RecycledChunk {
7982
lines: recycled_contents.0,
8083
selections: recycled_contents.1,
8184
num_infos: recycled_contents.2,
8285
parsed_floats: recycled_contents.3,
86+
line_num_floats: recycled_contents.4,
8387
buffer: self.into_owner(),
8488
}
8589
}
@@ -97,6 +101,7 @@ pub struct RecycledChunk {
97101
selections: Vec<&'static str>,
98102
num_infos: Vec<NumInfo>,
99103
parsed_floats: Vec<GeneralF64ParseResult>,
104+
line_num_floats: Vec<Option<f64>>,
100105
buffer: Vec<u8>,
101106
}
102107

@@ -107,6 +112,7 @@ impl RecycledChunk {
107112
selections: Vec::new(),
108113
num_infos: Vec::new(),
109114
parsed_floats: Vec::new(),
115+
line_num_floats: Vec::new(),
110116
buffer: vec![0; capacity],
111117
}
112118
}
@@ -149,6 +155,7 @@ pub fn read<T: Read>(
149155
selections,
150156
num_infos,
151157
parsed_floats,
158+
line_num_floats,
152159
mut buffer,
153160
} = recycled_chunk;
154161
if buffer.len() < carry_over.len() {
@@ -184,6 +191,7 @@ pub fn read<T: Read>(
184191
selections,
185192
num_infos,
186193
parsed_floats,
194+
line_num_floats,
187195
};
188196
parse_lines(read, &mut lines, &mut line_data, separator, settings);
189197
Ok(ChunkContents { lines, line_data })
@@ -207,6 +215,7 @@ fn parse_lines<'a>(
207215
assert!(line_data.selections.is_empty());
208216
assert!(line_data.num_infos.is_empty());
209217
assert!(line_data.parsed_floats.is_empty());
218+
assert!(line_data.line_num_floats.is_empty());
210219
let mut token_buffer = vec![];
211220
lines.extend(
212221
read.split(separator as char)

src/uu/sort/src/sort.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,13 @@ impl<'a> Line<'a> {
460460
if settings.precomputed.needs_tokens {
461461
tokenize(line, settings.separator, token_buffer);
462462
}
463+
if settings.mode == SortMode::Numeric {
464+
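// Skip lines containing any letter: `f64` parsing would otherwise accept `inf`, `nan` and scientific notation (e.g. `1e3`), which must not take this fast path.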
465+
let line_num_float = (!line.contains(char::is_alphabetic))
466+
.then(|| line.parse::<f64>().ok())
467+
.flatten();
468+
line_data.line_num_floats.push(line_num_float);
469+
}
463470
for (selector, selection) in settings
464471
.selectors
465472
.iter()
@@ -1563,6 +1570,24 @@ fn compare_by<'a>(
15631570
let mut selection_index = 0;
15641571
let mut num_info_index = 0;
15651572
let mut parsed_float_index = 0;
1573+
1574+
if let (Some(Some(a_f64)), Some(Some(b_f64))) = (
1575+
a_line_data.line_num_floats.get(a.index),
1576+
b_line_data.line_num_floats.get(b.index),
1577+
) {
1578+
// we don't use total_cmp() because it always sorts -0 before 0
1579+
if let Some(cmp) = a_f64.partial_cmp(b_f64) {
1580+
// don't trust `Ordering::Equal` if lines are not fully equal
1581+
if cmp != Ordering::Equal || a.line == b.line {
1582+
return if global_settings.reverse {
1583+
cmp.reverse()
1584+
} else {
1585+
cmp
1586+
};
1587+
}
1588+
}
1589+
}
1590+
15661591
for selector in &global_settings.selectors {
15671592
let (a_str, b_str) = if selector.needs_selection {
15681593
let selections = (

0 commit comments

Comments
 (0)