Skip to content

Use restricted Damerau-Levenshtein algorithm #11963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/cargo/core/package_id_spec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ use serde::{de, ser};
use url::Url;

use crate::core::PackageId;
use crate::util::edit_distance;
use crate::util::errors::CargoResult;
use crate::util::interning::InternedString;
use crate::util::lev_distance;
use crate::util::{validate_package_name, IntoUrl, ToSemver};

/// Some or all of the data required to identify a package:
Expand Down Expand Up @@ -88,7 +88,7 @@ impl PackageIdSpec {
{
let i: Vec<_> = i.into_iter().collect();
let spec = PackageIdSpec::parse(spec).with_context(|| {
let suggestion = lev_distance::closest_msg(spec, i.iter(), |id| id.name().as_str());
let suggestion = edit_distance::closest_msg(spec, i.iter(), |id| id.name().as_str());
format!("invalid package ID specification: `{}`{}", spec, suggestion)
})?;
spec.query(i)
Expand Down Expand Up @@ -229,7 +229,7 @@ impl PackageIdSpec {
);
}
if suggestion.is_empty() {
suggestion.push_str(&lev_distance::closest_msg(
suggestion.push_str(&edit_distance::closest_msg(
&self.name,
all_ids.iter(),
|id| id.name().as_str(),
Expand Down
5 changes: 2 additions & 3 deletions src/cargo/core/resolver/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::fmt;
use std::task::Poll;

use crate::core::{Dependency, PackageId, QueryKind, Registry, Summary};
use crate::util::lev_distance::lev_distance;
use crate::util::edit_distance::edit_distance;
use crate::util::{Config, VersionExt};
use anyhow::Error;

Expand Down Expand Up @@ -308,8 +308,7 @@ pub(super) fn activation_error(
candidates.dedup_by(|a, b| a.name() == b.name());
let mut candidates: Vec<_> = candidates
.iter()
.map(|n| (lev_distance(&*new_dep.package_name(), &*n.name()), n))
.filter(|&(d, _)| d < 4)
.filter_map(|n| Some((edit_distance(&*new_dep.package_name(), &*n.name(), 3)?, n)))
.collect();
candidates.sort_by_key(|o| o.0);
let mut msg: String;
Expand Down
23 changes: 12 additions & 11 deletions src/cargo/core/workspace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ use crate::core::{Dependency, FeatureValue, PackageId, PackageIdSpec};
use crate::core::{EitherManifest, Package, SourceId, VirtualManifest};
use crate::ops;
use crate::sources::{PathSource, CRATES_IO_INDEX, CRATES_IO_REGISTRY};
use crate::util::edit_distance;
use crate::util::errors::{CargoResult, ManifestError};
use crate::util::interning::InternedString;
use crate::util::lev_distance;
use crate::util::toml::{read_manifest, InheritableFields, TomlDependency, TomlProfiles};
use crate::util::{config::ConfigRelativePath, Config, Filesystem, IntoUrl};
use cargo_util::paths;
Expand Down Expand Up @@ -1245,8 +1245,9 @@ impl<'cfg> Workspace<'cfg> {
optional_dependency_names_per_member.insert(member, optional_dependency_names_raw);
}

let levenshtein_test =
|a: InternedString, b: InternedString| lev_distance(a.as_str(), b.as_str()) < 4;
let edit_distance_test = |a: InternedString, b: InternedString| {
edit_distance(a.as_str(), b.as_str(), 3).is_some()
};

let suggestions: Vec<_> = cli_features
.features
Expand All @@ -1257,12 +1258,12 @@ impl<'cfg> Workspace<'cfg> {
// Finds member features which are similar to the requested feature.
let summary_features = summary_features
.iter()
.filter(move |feature| levenshtein_test(**feature, *typo));
.filter(move |feature| edit_distance_test(**feature, *typo));

// Finds optional dependencies which name is similar to the feature
let optional_dependency_features = optional_dependency_names
.iter()
.filter(move |feature| levenshtein_test(**feature, *typo));
.filter(move |feature| edit_distance_test(**feature, *typo));

summary_features
.chain(optional_dependency_features)
Expand All @@ -1278,13 +1279,13 @@ impl<'cfg> Workspace<'cfg> {
// Finds set of `pkg/feat` that are very similar to current `pkg/feat`.
let pkg_feat_similar = dependencies_features
.iter()
.filter(|(name, _)| levenshtein_test(**name, *dep_name))
.filter(|(name, _)| edit_distance_test(**name, *dep_name))
.map(|(name, features)| {
(
name,
features
.iter()
.filter(|feature| levenshtein_test(**feature, *dep_feature))
.filter(|feature| edit_distance_test(**feature, *dep_feature))
.collect::<Vec<_>>(),
)
})
Expand All @@ -1298,12 +1299,12 @@ impl<'cfg> Workspace<'cfg> {
// Finds set of `member/optional_dep` features which name is similar to current `pkg/feat`.
let optional_dependency_features = optional_dependency_names_per_member
.iter()
.filter(|(package, _)| levenshtein_test(package.name(), *dep_name))
.filter(|(package, _)| edit_distance_test(package.name(), *dep_name))
.map(|(package, optional_dependencies)| {
optional_dependencies
.into_iter()
.filter(|optional_dependency| {
levenshtein_test(**optional_dependency, *dep_name)
edit_distance_test(**optional_dependency, *dep_name)
})
.map(move |optional_dependency| {
format!("{}/{}", package.name(), optional_dependency)
Expand All @@ -1314,12 +1315,12 @@ impl<'cfg> Workspace<'cfg> {
// Finds set of `member/feat` features which name is similar to current `pkg/feat`.
let summary_features = summary_features_per_member
.iter()
.filter(|(package, _)| levenshtein_test(package.name(), *dep_name))
.filter(|(package, _)| edit_distance_test(package.name(), *dep_name))
.map(|(package, summary_features)| {
summary_features
.into_iter()
.filter(|summary_feature| {
levenshtein_test(**summary_feature, *dep_feature)
edit_distance_test(**summary_feature, *dep_feature)
})
.map(move |summary_feature| {
format!("{}/{}", package.name(), summary_feature)
Expand Down
4 changes: 2 additions & 2 deletions src/cargo/ops/cargo_clean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ use crate::core::compiler::{CompileKind, CompileMode, Layout, RustcTargetData};
use crate::core::profiles::Profiles;
use crate::core::{PackageIdSpec, TargetKind, Workspace};
use crate::ops;
use crate::util::edit_distance;
use crate::util::errors::CargoResult;
use crate::util::interning::InternedString;
use crate::util::lev_distance;
use crate::util::{Config, Progress, ProgressStyle};

use anyhow::Context as _;
Expand Down Expand Up @@ -118,7 +118,7 @@ pub fn clean(ws: &Workspace<'_>, opts: &CleanOptions<'_>) -> CargoResult<()> {
let matches: Vec<_> = resolve.iter().filter(|id| spec.matches(*id)).collect();
if matches.is_empty() {
let mut suggestion = String::new();
suggestion.push_str(&lev_distance::closest_msg(
suggestion.push_str(&edit_distance::closest_msg(
&spec.name(),
resolve.iter(),
|id| id.name().as_str(),
Expand Down
143 changes: 143 additions & 0 deletions src/cargo/util/edit_distance.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
use std::{cmp, mem};

/// Computes the [edit distance] between `a` and `b`, compared case-insensitively.
///
/// An insertion, a deletion, a substitution, or a swap of two adjacent
/// characters each count as a single edit.
///
/// Returns `Some(distance)` when the distance is at most `limit`, and `None`
/// when it is larger.
///
/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
pub fn edit_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
    // Both inputs are lowercased before comparing so that a mere difference in
    // capitalization is cheaper than a genuinely different letter. Without this,
    // `CHECK` would be exactly as far from `check` as it is from `build`
    // (distance 5), and a one-letter shortcut such as `C` would be no closer to
    // `c` than to `b`.
    let a_chars: Vec<char> = a.to_lowercase().chars().collect();
    let b_chars: Vec<char> = b.to_lowercase().chars().collect();

    // Keep the shorter input in `short`: the working rows below are sized by it,
    // which keeps memory use to a minimum.
    let (mut long, mut short) = (a_chars.as_slice(), b_chars.as_slice());
    if long.len() < short.len() {
        mem::swap(&mut long, &mut short);
    }

    // The length difference is a lower bound on the distance, so bail out right
    // away when it already exceeds the limit.
    let length_gap = long.len() - short.len();
    if length_gap > limit {
        return None;
    }

    // Drop the leading characters both inputs share.
    let shared_prefix = long.iter().zip(short).take_while(|(x, y)| x == y).count();
    long = &long[shared_prefix..];
    short = &short[shared_prefix..];

    // Drop the trailing characters both inputs share.
    let shared_suffix = long
        .iter()
        .rev()
        .zip(short.iter().rev())
        .take_while(|(x, y)| x == y)
        .count();
    long = &long[..long.len() - shared_suffix];
    short = &short[..short.len() - shared_suffix];

    // With nothing left of the shorter input, the distance is whatever remains
    // of the longer one, which is exactly the length difference.
    if short.is_empty() {
        return Some(length_gap);
    }

    let width = short.len();
    // `two_back` starts out as a placeholder; it is fully overwritten (as `row`,
    // after the first rotation) before any of its values are read.
    let mut two_back = vec![usize::MAX; width + 1];
    let mut one_back: Vec<usize> = (0..=width).collect();
    let mut row = vec![0; width + 1];

    // One row per character of the longer input.
    for (i, &long_ch) in long.iter().enumerate() {
        row[0] = i + 1;

        // One column per character of the shorter input.
        for (j, &short_ch) in short.iter().enumerate() {
            let col = j + 1;

            let deletion = one_back[col] + 1;
            let insertion = row[col - 1] + 1;
            // Replacing a character with itself is free.
            let substitution = one_back[col - 1] + usize::from(long_ch != short_ch);

            let mut best = cmp::min(deletion, cmp::min(insertion, substitution));

            // Transposition: the current pair matches the previous pair swapped.
            if i > 0 && j > 0 && long_ch == short[j - 1] && long[i - 1] == short_ch {
                best = cmp::min(best, two_back[col - 2] + 1);
            }

            row[col] = best;
        }

        // Shift the rows along by one, recycling the oldest allocation.
        [two_back, one_back, row] = [one_back, row, two_back];
    }

    // The last computed row now lives in `one_back` because of the final rotation.
    Some(one_back[width]).filter(|&distance| distance <= limit)
}

/// Find the closest element from `iter` matching `choice`. The `key` callback
/// is used to select a `&str` from the iterator to compare against `choice`.
pub fn closest<'a, T>(
choice: &str,
iter: impl Iterator<Item = T>,
key: impl Fn(&T) -> &'a str,
) -> Option<T> {
// Only consider candidates with an edit distance of 3 or less so we don't
// suggest out-of-the-blue options.
iter.filter_map(|e| Some((edit_distance(choice, key(&e), 3)?, e)))
.min_by_key(|t| t.0)
.map(|t| t.1)
}

/// Builds a "Did you mean ...?" suffix for an error message.
///
/// Runs `closest` over `iter` and, when a match is found, returns a string
/// naming that match's key, ready to be appended to an error message. Returns
/// an empty string when nothing is close enough to `choice`.
pub fn closest_msg<'a, T>(
    choice: &str,
    iter: impl Iterator<Item = T>,
    key: impl Fn(&T) -> &'a str,
) -> String {
    closest(choice, iter, &key).map_or_else(String::new, |found| {
        format!("\n\n\tDid you mean `{}`?", key(&found))
    })
}

/// Checks that `edit_distance` counts characters rather than bytes, and that
/// the result does not depend on argument order.
#[test]
fn test_edit_distance() {
    // Every valid scalar value below `char::MAX`, whatever its UTF-8 length,
    // must be at distance zero from itself.
    for code_point in 0..u32::from(char::MAX) {
        if let Some(ch) = char::from_u32(code_point) {
            let single = ch.to_string();
            assert_eq!(edit_distance(&single, &single, usize::MAX), Some(0));
        }
    }

    let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
    let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
    let c = "Mary häd ä little lämb\n\nLittle lämb\n";

    // (left, right, expected distance); each pair is checked in both directions.
    let cases = [(a, b, 1), (a, c, 2), (b, c, 1)];
    for (left, right, expected) in cases {
        assert_eq!(edit_distance(left, right, usize::MAX), Some(expected));
        assert_eq!(edit_distance(right, left, usize::MAX), Some(expected));
    }
}
93 changes: 0 additions & 93 deletions src/cargo/util/lev_distance.rs

This file was deleted.

Loading