Skip to content

Commit f1e7acb

Browse files
committed
Extract edit distance to its own file
1 parent b245b84 commit f1e7acb

File tree

2 files changed

+240
-255
lines changed

2 files changed

+240
-255
lines changed

src/tools/compiletest/src/util.rs

+1-255
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
use crate::common::Config;
2-
use std::cmp;
32
use std::env;
43
use std::ffi::OsStr;
5-
use std::mem;
64
use std::path::PathBuf;
75
use std::process::Command;
86

97
use tracing::*;
108

119
#[cfg(test)]
1210
mod tests;
11+
pub(crate) mod edit_distance;
1312

1413
pub fn make_new_path(path: &str) -> String {
1514
assert!(cfg!(windows));
@@ -78,256 +77,3 @@ pub fn add_dylib_path(cmd: &mut Command, paths: impl Iterator<Item = impl Into<P
7877
let new_paths = paths.map(Into::into).chain(old_paths.into_iter().flatten());
7978
cmd.env(dylib_env_var(), env::join_paths(new_paths).unwrap());
8079
}
81-
82-
// Edit distance taken from rustc's `rustc_span::edit_distance`.
83-
84-
/// Finds the [edit distance] between two strings.
85-
///
86-
/// Returns `None` if the distance exceeds the limit.
87-
///
88-
/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
89-
pub fn edit_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
90-
let mut a = &a.chars().collect::<Vec<_>>()[..];
91-
let mut b = &b.chars().collect::<Vec<_>>()[..];
92-
93-
// Ensure that `b` is the shorter string, minimizing memory use.
94-
if a.len() < b.len() {
95-
mem::swap(&mut a, &mut b);
96-
}
97-
98-
let min_dist = a.len() - b.len();
99-
// If we know the limit will be exceeded, we can return early.
100-
if min_dist > limit {
101-
return None;
102-
}
103-
104-
// Strip common prefix.
105-
loop {
106-
let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_first().zip(a.split_first())
107-
else {
108-
break;
109-
};
110-
111-
if a_char != b_char {
112-
break;
113-
}
114-
115-
a = a_rest;
116-
b = b_rest;
117-
}
118-
119-
// Strip common suffix.
120-
loop {
121-
let Some(((b_char, b_rest), (a_char, a_rest))) = b.split_last().zip(a.split_last()) else {
122-
break;
123-
};
124-
125-
if a_char != b_char {
126-
break;
127-
}
128-
129-
a = a_rest;
130-
b = b_rest;
131-
}
132-
133-
// If either string is empty, the distance is the length of the other.
134-
// We know that `b` is the shorter string, so we don't need to check `a`.
135-
if b.len() == 0 {
136-
return Some(min_dist);
137-
}
138-
139-
let mut prev_prev = vec![usize::MAX; b.len() + 1];
140-
let mut prev = (0..=b.len()).collect::<Vec<_>>();
141-
let mut current = vec![0; b.len() + 1];
142-
143-
// row by row
144-
for i in 1..=a.len() {
145-
current[0] = i;
146-
let a_idx = i - 1;
147-
148-
// column by column
149-
for j in 1..=b.len() {
150-
let b_idx = j - 1;
151-
152-
// There is no cost to substitute a character with itself.
153-
let substitution_cost = if a[a_idx] == b[b_idx] { 0 } else { 1 };
154-
155-
current[j] = cmp::min(
156-
// deletion
157-
prev[j] + 1,
158-
cmp::min(
159-
// insertion
160-
current[j - 1] + 1,
161-
// substitution
162-
prev[j - 1] + substitution_cost,
163-
),
164-
);
165-
166-
if (i > 1) && (j > 1) && (a[a_idx] == b[b_idx - 1]) && (a[a_idx - 1] == b[b_idx]) {
167-
// transposition
168-
current[j] = cmp::min(current[j], prev_prev[j - 2] + 1);
169-
}
170-
}
171-
172-
// Rotate the buffers, reusing the memory.
173-
[prev_prev, prev, current] = [prev, current, prev_prev];
174-
}
175-
176-
// `prev` because we already rotated the buffers.
177-
let distance = prev[b.len()];
178-
(distance <= limit).then_some(distance)
179-
}
180-
181-
/// Provides a word similarity score between two words that accounts for substrings being more
182-
/// meaningful than a typical edit distance. The lower the score, the closer the match. 0 is an
183-
/// identical match.
184-
///
185-
/// Uses the edit distance between the two strings and removes the cost of the length difference.
186-
/// If this is 0 then it is either a substring match or a full word match, in the substring match
187-
/// case we detect this and return `1`. To prevent finding meaningless substrings, eg. "in" in
188-
/// "shrink", we only perform this subtraction of length difference if one of the words is not
189-
/// greater than twice the length of the other. For cases where the words are close in size but not
190-
/// an exact substring then the cost of the length difference is discounted by half.
191-
///
192-
/// Returns `None` if the distance exceeds the limit.
193-
pub fn edit_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> {
194-
let n = a.chars().count();
195-
let m = b.chars().count();
196-
197-
// Check one isn't less than half the length of the other. If this is true then there is a
198-
// big difference in length.
199-
let big_len_diff = (n * 2) < m || (m * 2) < n;
200-
let len_diff = if n < m { m - n } else { n - m };
201-
let distance = edit_distance(a, b, limit + len_diff)?;
202-
203-
// This is the crux, subtracting length difference means exact substring matches will now be 0
204-
let score = distance - len_diff;
205-
206-
// If the score is 0 but the words have different lengths then it's a substring match not a full
207-
// word match
208-
let score = if score == 0 && len_diff > 0 && !big_len_diff {
209-
1 // Exact substring match, but not a total word match so return non-zero
210-
} else if !big_len_diff {
211-
// Not a big difference in length, discount cost of length difference
212-
score + (len_diff + 1) / 2
213-
} else {
214-
// A big difference in length, add back the difference in length to the score
215-
score + len_diff
216-
};
217-
218-
(score <= limit).then_some(score)
219-
}
220-
221-
/// Finds the best match for given word in the given iterator where substrings are meaningful.
222-
///
223-
/// A version of [`find_best_match_for_name`] that uses [`edit_distance_with_substrings`] as the
224-
/// score for word similarity. This takes an optional distance limit which defaults to one-third of
225-
/// the given word.
226-
///
227-
/// We use case insensitive comparison to improve accuracy on an edge case with a lower(upper)case
228-
/// letters mismatch.
229-
pub fn find_best_match_for_name_with_substrings<'c, 'd, 'l>(
230-
candidates: &'c [&'d str],
231-
lookup: &'l str,
232-
dist: Option<usize>,
233-
) -> Option<&'d str> {
234-
find_best_match_for_name_impl(true, candidates, lookup, dist)
235-
}
236-
237-
/// Finds the best match for a given word in the given iterator.
238-
///
239-
/// As a loose rule to avoid the obviously incorrect suggestions, it takes
240-
/// an optional limit for the maximum allowable edit distance, which defaults
241-
/// to one-third of the given word.
242-
///
243-
/// We use case insensitive comparison to improve accuracy on an edge case with a lower(upper)case
244-
/// letters mismatch.
245-
pub fn find_best_match_for_name<'c, 'd, 'l>(
246-
candidates: &'c [&'d str],
247-
lookup: &'l str,
248-
dist: Option<usize>,
249-
) -> Option<&'d str> {
250-
find_best_match_for_name_impl(false, candidates, lookup, dist)
251-
}
252-
253-
#[cold]
254-
fn find_best_match_for_name_impl<'c, 'd, 'l>(
255-
use_substring_score: bool,
256-
candidates: &'c [&'d str],
257-
lookup: &'l str,
258-
dist: Option<usize>,
259-
) -> Option<&'d str> {
260-
let lookup_uppercase = lookup.to_uppercase();
261-
262-
// Priority of matches:
263-
// 1. Exact case insensitive match
264-
// 2. Edit distance match
265-
// 3. Sorted word match
266-
if let Some(c) = candidates.iter().find(|c| c.to_uppercase() == lookup_uppercase) {
267-
return Some(*c);
268-
}
269-
270-
// `fn edit_distance()` use `chars()` to calculate edit distance, so we must
271-
// also use `chars()` (and not `str::len()`) to calculate length here.
272-
let lookup_len = lookup.chars().count();
273-
274-
let mut dist = dist.unwrap_or_else(|| cmp::max(lookup_len, 3) / 3);
275-
let mut best = None;
276-
// store the candidates with the same distance, only for `use_substring_score` current.
277-
let mut next_candidates = vec![];
278-
for c in candidates {
279-
match if use_substring_score {
280-
edit_distance_with_substrings(lookup, c, dist)
281-
} else {
282-
edit_distance(lookup, c, dist)
283-
} {
284-
Some(0) => return Some(*c),
285-
Some(d) => {
286-
if use_substring_score {
287-
if d < dist {
288-
dist = d;
289-
next_candidates.clear();
290-
} else {
291-
// `d == dist` here, we need to store the candidates with the same distance
292-
// so we won't decrease the distance in the next loop.
293-
}
294-
next_candidates.push(*c);
295-
} else {
296-
dist = d - 1;
297-
}
298-
best = Some(*c);
299-
}
300-
None => {}
301-
}
302-
}
303-
304-
// We have a tie among several candidates, try to select the best among them ignoring substrings.
305-
// For example, the candidates list `force_capture`, `capture`, and user inputted `forced_capture`,
306-
// we select `force_capture` with a extra round of edit distance calculation.
307-
if next_candidates.len() > 1 {
308-
debug_assert!(use_substring_score);
309-
best = find_best_match_for_name_impl(false, &next_candidates, lookup, Some(lookup.len()));
310-
}
311-
if best.is_some() {
312-
return best;
313-
}
314-
315-
find_match_by_sorted_words(candidates, lookup)
316-
}
317-
318-
fn find_match_by_sorted_words<'c, 'd, 'l>(
319-
iter_names: &'c [&'d str],
320-
lookup: &'l str,
321-
) -> Option<&'d str> {
322-
let lookup_sorted_by_words = sort_by_words(lookup);
323-
iter_names.iter().fold(None, |result, candidate| {
324-
if sort_by_words(candidate) == lookup_sorted_by_words { Some(*candidate) } else { result }
325-
})
326-
}
327-
328-
fn sort_by_words(name: &str) -> Vec<&str> {
329-
let mut split_words: Vec<&str> = name.split('_').collect();
330-
// We are sorting primitive &strs and can use unstable sort here.
331-
split_words.sort_unstable();
332-
split_words
333-
}

0 commit comments

Comments
 (0)