From 3bccc516651e3e0eaa38d03e4b6f19c6ec3db9be Mon Sep 17 00:00:00 2001 From: camilesing Date: Sat, 10 Jan 2026 20:26:38 +0800 Subject: [PATCH] feat: Enhance not found errors with suggestions --- rust/lance-core/src/datatypes/schema.rs | 53 +++++++-- rust/lance-core/src/levenshtein.rs | 149 ++++++++++++++++++++++++ rust/lance-core/src/lib.rs | 1 + rust/lance-index/src/lib.rs | 37 +++++- rust/lance-linalg/src/distance.rs | 13 ++- rust/lance-linalg/src/lib.rs | 2 +- rust/lance-namespace/src/error.rs | 66 +++++++++++ rust/test_error_suggestions.rs | 107 +++++++++++++++++ 8 files changed, 412 insertions(+), 16 deletions(-) create mode 100644 rust/lance-core/src/levenshtein.rs create mode 100644 rust/test_error_suggestions.rs diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index cf225fe2dd2..c3201a7380c 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -50,9 +50,18 @@ impl FieldRef<'_> { Ok(id) } FieldRef::ByPath(path) => { - let field = schema.field(path).ok_or_else(|| Error::InvalidInput { - source: format!("Field '{}' not found in schema", path).into(), - location: location!(), + let field = schema.field(path).ok_or_else(|| { + let paths = schema.field_paths(); + let field_paths: Vec<&str> = paths.iter().map(|s| s.as_str()).collect(); + let suggestion = crate::levenshtein::find_best_suggestion(path, &field_paths); + let mut error_msg = format!("Field '{}' not found in schema", path); + if let Some(suggestion) = suggestion { + error_msg = format!("{}. Did you mean '{}'?", error_msg, suggestion); + } + Error::InvalidInput { + source: error_msg.into(), + location: location!(), + } })?; Ok(field.id) } @@ -331,6 +340,27 @@ impl Schema { SchemaFieldIterPreOrder::new(self) } + /// Get all field paths in the schema as a list of strings. + /// + /// This returns all field paths in the schema, including nested fields. 
+ /// For example, if there's a struct field "user" with a field "name", + /// this will return "user.name" as one of the paths. + pub fn field_paths(&self) -> Vec { + let mut paths = Vec::new(); + for field in self.fields_pre_order() { + let ancestry = self.field_ancestry_by_id(field.id); + if let Some(ancestry) = ancestry { + let path = ancestry + .iter() + .map(|f| f.name.as_str()) + .collect::>() + .join("."); + paths.push(path); + } + } + paths + } + /// Returns a new schema that only contains the fields in `column_ids`. /// /// This projection can filter out both top-level and nested fields @@ -491,12 +521,19 @@ impl Schema { // TODO: This is not a public API, change to pub(crate) after refactor is done. pub fn field_id(&self, column: &str) -> Result { - self.field(column) - .map(|f| f.id) - .ok_or_else(|| Error::Schema { - message: "Vector column not in schema".to_string(), + self.field(column).map(|f| f.id).ok_or_else(|| { + let paths = self.field_paths(); + let field_paths: Vec<&str> = paths.iter().map(|s| s.as_str()).collect(); + let suggestion = crate::levenshtein::find_best_suggestion(column, &field_paths); + let mut error_msg = format!("Vector column '{}' not in schema", column); + if let Some(suggestion) = suggestion { + error_msg = format!("{}. Did you mean '{}'?", error_msg, suggestion); + } + Error::Schema { + message: error_msg.to_string(), location: location!(), - }) + } + }) } pub fn top_level_field_ids(&self) -> Vec { diff --git a/rust/lance-core/src/levenshtein.rs b/rust/lance-core/src/levenshtein.rs new file mode 100644 index 00000000000..cc4aa22aae5 --- /dev/null +++ b/rust/lance-core/src/levenshtein.rs @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +/// Calculate the Levenshtein distance between two strings. 
+/// +/// The Levenshtein distance is a measure of the number of single-character edits +/// (insertions, deletions, or substitutions) required to change one word into the other. +/// +/// # Examples +/// +/// ``` +/// use lance_core::levenshtein::levenshtein_distance; +/// +/// assert_eq!(levenshtein_distance("kitten", "sitting"), 3); +/// assert_eq!(levenshtein_distance("hello", "hello"), 0); +/// assert_eq!(levenshtein_distance("hello", "world"), 4); +/// ``` +pub fn levenshtein_distance(s1: &str, s2: &str) -> usize { + let s1_len = s1.chars().count(); + let s2_len = s2.chars().count(); + + // If one of the strings is empty, the distance is the length of the other + if s1_len == 0 { + return s2_len; + } + if s2_len == 0 { + return s1_len; + } + + // Create a matrix to store the distances + let mut matrix = vec![vec![0; s2_len + 1]; s1_len + 1]; + + // Initialize the first row and column + for i in 0..=s1_len { + matrix[i][0] = i; + } + for j in 0..=s2_len { + matrix[0][j] = j; + } + + // Fill the matrix + let s1_chars: Vec = s1.chars().collect(); + let s2_chars: Vec = s2.chars().collect(); + + for i in 1..=s1_len { + for j in 1..=s2_len { + let cost = if s1_chars[i - 1] == s2_chars[j - 1] { + 0 + } else { + 1 + }; + matrix[i][j] = std::cmp::min( + std::cmp::min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1), + matrix[i - 1][j - 1] + cost, + ); + } + } + + matrix[s1_len][s2_len] +} + +/// Find the best suggestion from a list of options based on Levenshtein distance. +/// +/// Returns `Some(suggestion)` if there's an option where the Levenshtein distance +/// is less than 1/3 of the length of the input string. +/// Otherwise returns `None`. 
+/// +/// # Examples +/// +/// ``` +/// use lance_core::levenshtein::find_best_suggestion; +/// +/// let options = vec!["vector", "column", "table"]; +/// assert_eq!(find_best_suggestion("vacter", &options), Some("vector".to_string())); +/// assert_eq!(find_best_suggestion("hello", &options), None); +/// ``` +pub fn find_best_suggestion(input: &str, options: &[&str]) -> Option<String> { + let input_len = input.chars().count(); + if input_len == 0 { + return None; + } + + let threshold = input_len / 3; + let mut best_option: Option<(String, usize)> = None; + + for option in options { + let distance = levenshtein_distance(input, option); + if distance <= threshold { + match &best_option { + None => best_option = Some((option.to_string(), distance)), + Some((_, best_distance)) => { + if distance < *best_distance { + best_option = Some((option.to_string(), distance)); + } + } + } + } + } + + best_option.map(|(option, _)| option) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_levenshtein_distance() { + assert_eq!(levenshtein_distance("", ""), 0); + assert_eq!(levenshtein_distance("a", ""), 1); + assert_eq!(levenshtein_distance("", "a"), 1); + assert_eq!(levenshtein_distance("abc", "abc"), 0); + assert_eq!(levenshtein_distance("kitten", "sitting"), 3); + assert_eq!(levenshtein_distance("hello", "world"), 4); + assert_eq!(levenshtein_distance("vector", "vectr"), 1); + assert_eq!(levenshtein_distance("vector", "vectors"), 1); + assert_eq!(levenshtein_distance("vacter", "vector"), 2); + } + + #[test] + fn test_find_best_suggestion() { + let options = vec!["vector", "column", "table"]; + + assert_eq!( + find_best_suggestion("vacter", &options), + Some("vector".to_string()) + ); + assert_eq!( + find_best_suggestion("vectr", &options), + Some("vector".to_string()) + ); + assert_eq!( + find_best_suggestion("column", &options), + Some("column".to_string()) + ); + assert_eq!( + find_best_suggestion("tble", &options), + Some("table".to_string()) + ); + + // 
Should return None if no good match + assert_eq!(find_best_suggestion("hello", &options), None); + assert_eq!(find_best_suggestion("world", &options), None); + + // Should return None if input is too short + assert_eq!(find_best_suggestion("v", &options), None); + assert_eq!(find_best_suggestion("", &options), None); + } +} diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 8c669eda223..0860f710c84 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -9,6 +9,7 @@ pub mod cache; pub mod container; pub mod datatypes; pub mod error; +pub mod levenshtein; pub mod traits; pub mod utils; diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index 5ed4638b6cb..369e922d1ca 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -207,10 +207,33 @@ impl TryFrom<&str> for IndexType { "IVF_HNSW_FLAT" => Ok(Self::IvfHnswFlat), "IVF_HNSW_SQ" => Ok(Self::IvfHnswSq), "IVF_HNSW_PQ" => Ok(Self::IvfHnswPq), - _ => Err(Error::invalid_input( - format!("invalid index type: {}", value), - location!(), - )), + _ => { + let valid_index_types = vec![ + "BTree", + "Bitmap", + "LabelList", + "Inverted", + "NGram", + "FragmentReuse", + "MemWal", + "ZoneMap", + "Vector", + "IVF_FLAT", + "IVF_SQ", + "IVF_PQ", + "IVF_RQ", + "IVF_HNSW_FLAT", + "IVF_HNSW_SQ", + "IVF_HNSW_PQ", + ]; + let suggestion = + lance_core::levenshtein::find_best_suggestion(value, &valid_index_types); + let mut error_msg = format!("invalid index type: {}", value); + if let Some(suggestion) = suggestion { + error_msg = format!("{}. 
Did you mean '{}'?", error_msg, suggestion); + } + Err(Error::invalid_input(error_msg, location!())) + } } } } diff --git a/rust/lance-linalg/src/distance.rs b/rust/lance-linalg/src/distance.rs index 84c81fe85ed..370f472dc65 100644 --- a/rust/lance-linalg/src/distance.rs +++ b/rust/lance-linalg/src/distance.rs @@ -97,9 +97,16 @@ impl TryFrom<&str> for DistanceType { "cosine" => Ok(Self::Cosine), "dot" => Ok(Self::Dot), "hamming" => Ok(Self::Hamming), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Metric type '{s}' is not supported" - ))), + _ => { + let valid_distance_types = vec!["l2", "euclidean", "cosine", "dot", "hamming"]; + let suggestion = + lance_core::levenshtein::find_best_suggestion(s, &valid_distance_types); + let mut error_msg = format!("Metric type '{s}' is not supported"); + if let Some(suggestion) = suggestion { + error_msg = format!("{}. Did you mean '{}'?", error_msg, suggestion); + } + Err(ArrowError::InvalidArgumentError(error_msg)) + } } } } diff --git a/rust/lance-linalg/src/lib.rs b/rust/lance-linalg/src/lib.rs index 0d7654cb7cf..3e8988a85eb 100644 --- a/rust/lance-linalg/src/lib.rs +++ b/rust/lance-linalg/src/lib.rs @@ -18,5 +18,5 @@ pub(crate) mod test_utils; pub use clustering::Clustering; -type Error = ArrowError; +use lance_core::Error; pub type Result = std::result::Result; diff --git a/rust/lance-namespace/src/error.rs b/rust/lance-namespace/src/error.rs index 71fb7c12c31..693b6abf08f 100644 --- a/rust/lance-namespace/src/error.rs +++ b/rust/lance-namespace/src/error.rs @@ -145,6 +145,34 @@ impl std::fmt::Display for ErrorCode { } } +/// Error for table column not found with suggestion. 
+#[derive(Debug, Clone)] +pub struct TableColumnNotFoundWithSuggestionError { + message: String, + suggestion: Option, +} + +impl TableColumnNotFoundWithSuggestionError { + pub fn new(message: String, suggestion: Option) -> Self { + Self { + message, + suggestion, + } + } +} + +impl std::fmt::Display for TableColumnNotFoundWithSuggestionError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Table column not found: {}", self.message)?; + if let Some(suggestion) = &self.suggestion { + write!(f, ". Did you mean '{}'?", suggestion)?; + } + Ok(()) + } +} + +impl std::error::Error for TableColumnNotFoundWithSuggestionError {} + /// Lance Namespace error type. /// /// This enum provides fine-grained error types for Lance Namespace operations. @@ -222,6 +250,12 @@ pub enum NamespaceError { #[snafu(display("Table column not found: {message}"))] TableColumnNotFound { message: String }, + /// The specified table column does not exist with suggestions. + #[snafu(transparent)] + TableColumnNotFoundWithSuggestion { + source: TableColumnNotFoundWithSuggestionError, + }, + /// Malformed request or invalid parameters. #[snafu(display("Invalid input: {message}"))] InvalidInput { message: String }, @@ -274,6 +308,7 @@ impl NamespaceError { Self::TransactionNotFound { .. } => ErrorCode::TransactionNotFound, Self::TableVersionNotFound { .. } => ErrorCode::TableVersionNotFound, Self::TableColumnNotFound { .. } => ErrorCode::TableColumnNotFound, + Self::TableColumnNotFoundWithSuggestion { .. } => ErrorCode::TableColumnNotFound, Self::InvalidInput { .. } => ErrorCode::InvalidInput, Self::ConcurrentModification { .. } => ErrorCode::ConcurrentModification, Self::PermissionDenied { .. } => ErrorCode::PermissionDenied, @@ -317,6 +352,16 @@ impl NamespaceError { None => Self::Internal { message }, } } + + /// Creates a NamespaceError for table column not found with suggestion. 
+ pub fn table_column_not_found_with_suggestion( + message: impl Into, + suggestion: Option, + ) -> Self { + Self::TableColumnNotFoundWithSuggestion { + source: TableColumnNotFoundWithSuggestionError::new(message.into(), suggestion), + } + } } /// Converts a NamespaceError into a lance_core::Error. @@ -401,4 +446,25 @@ mod tests { }; assert_eq!(err.to_string(), "Table not found: users"); } + + #[test] + fn test_table_column_not_found_with_suggestion() { + let err = NamespaceError::table_column_not_found_with_suggestion( + "column_name".to_string(), + Some("column_name_suggestion".to_string()), + ); + assert_eq!(err.code(), ErrorCode::TableColumnNotFound); + assert_eq!( + err.to_string(), + "Table column not found: column_name. Did you mean 'column_name_suggestion'?" + ); + } + + #[test] + fn test_table_column_not_found_without_suggestion() { + let err = + NamespaceError::table_column_not_found_with_suggestion("column_name".to_string(), None); + assert_eq!(err.code(), ErrorCode::TableColumnNotFound); + assert_eq!(err.to_string(), "Table column not found: column_name"); + } } diff --git a/rust/test_error_suggestions.rs b/rust/test_error_suggestions.rs new file mode 100644 index 00000000000..4b6702f4aa8 --- /dev/null +++ b/rust/test_error_suggestions.rs @@ -0,0 +1,107 @@ +// Test file for error suggestions +use lance_core::{levenshtein::find_best_suggestion, DataType, Field, Schema}; +use lance_index::IndexType; +use lance_linalg::DistanceType; + +fn test_column_not_found() { + println!("=== Testing Column Not Found Errors ==="); + + // Create a schema with some fields + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("vector", DataType::Float32, false), + Field::new("text", DataType::Utf8, false), + Field::new( + "metadata", + DataType::Struct( + vec![ + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Float32, false), + ] + .into(), + ), + false, + ), + ]); + + // Test 1: Incorrect field name with 
suggestion + let result = schema.field_id("vacter"); + println!("Test 1 - Field 'vacter': {}", result.unwrap_err()); + + // Test 2: Incorrect nested field name + let result = schema.field_id("metadata.name"); + println!("Test 2 - Field 'metadata.name': {}", result.unwrap_err()); + + // Test 3: Completely wrong name + let result = schema.field_id("completely_wrong"); + println!("Test 3 - Field 'completely_wrong': {}", result.unwrap_err()); +} + +fn test_distance_type_not_found() { + println!("\n=== Testing Distance Type Not Found Errors ==="); + + // Test 1: Misspelled distance type with suggestion + let result = DistanceType::try_from("l1"); + println!("Test 1 - Distance 'l1': {}", result.unwrap_err()); + + // Test 2: Another misspelled distance type + let result = DistanceType::try_from("cosin"); + println!("Test 2 - Distance 'cosin': {}", result.unwrap_err()); + + // Test 3: Completely wrong distance type + let result = DistanceType::try_from("wrong_distance"); + println!( + "Test 3 - Distance 'wrong_distance': {}", + result.unwrap_err() + ); +} + +fn test_index_type_not_found() { + println!("\n=== Testing Index Type Not Found Errors ==="); + + // Test 1: Misspelled index type with suggestion + let result = IndexType::try_from("Btree"); + println!("Test 1 - Index 'Btree': {}", result.unwrap_err()); + + // Test 2: Another misspelled index type + let result = IndexType::try_from("Vectr"); + println!("Test 2 - Index 'Vectr': {}", result.unwrap_err()); + + // Test 3: Completely wrong index type + let result = IndexType::try_from("wrong_index"); + println!("Test 3 - Index 'wrong_index': {}", result.unwrap_err()); +} + +fn test_levenshtein_suggestion() { + println!("\n=== Testing Levenshtein Suggestion Function ==="); + + let options = vec!["vector", "id", "text", "metadata.name"]; + + // Test with 1 character difference + let suggestion = find_best_suggestion("vacter", &options); + println!("Test 1 - 'vacter' -> {:?}", suggestion); + + // Test with 2 character 
differences + let suggestion = find_best_suggestion("vecor", &options); + println!("Test 2 - 'vecor' -> {:?}", suggestion); + + // Test with more than 1/3 characters different + let suggestion = find_best_suggestion("vctr", &options); + println!("Test 3 - 'vctr' -> {:?}", suggestion); + + // Test exact match + let suggestion = find_best_suggestion("vector", &options); + println!("Test 4 - 'vector' -> {:?}", suggestion); +} + +fn main() { + println!("Testing Enhanced Error Messages with Suggestions"); + println!("=============================================="); + + test_levenshtein_suggestion(); + test_column_not_found(); + test_distance_type_not_found(); + test_index_type_not_found(); + + println!("\nAll tests completed!"); +}