Feat/70 preview in fuzzy search #74

Open · wants to merge 13 commits into main
2 changes: 1 addition & 1 deletion index/Cargo.toml
@@ -13,7 +13,7 @@ thiserror = { workspace = true }
 walkdir = "2.3.3"
 litt_shared = { path = "../shared" }
 rayon = "1.8.0"
-regex = "^1.0"
+unicode-segmentation = "1.9.0"
 
 [dev-dependencies]
 once_cell = "1.17.1"
66 changes: 48 additions & 18 deletions index/src/index.rs
@@ -6,10 +6,9 @@ use crate::Result;
 use litt_shared::search_schema::SearchSchema;
 use litt_shared::LITT_DIRECTORY_NAME;
 use rayon::prelude::*;
-use regex::Regex;
 use std::collections::HashMap;
 use std::convert::AsRef;
-use std::fs::{create_dir_all, File};
+use std::fs::{self, create_dir_all, File};
 use std::io::{self, Read};
 use std::path::{Path, PathBuf};
 use std::process::Command;
@@ -18,6 +17,7 @@ use std::time::SystemTime;
 use tantivy::query::QueryParser;
 use tantivy::schema::{Schema, TantivyDocument};
 use tantivy::{Index as TantivyIndex, IndexReader, IndexWriter, ReloadPolicy, Searcher};
+use unicode_segmentation::UnicodeSegmentation;
 use uuid::Uuid;
 use walkdir::{DirEntry, WalkDir};

@@ -44,7 +44,7 @@ pub enum Index {
     },
 }
 
-pub type PageIndex = HashMap<String, (u32, u32)>;
+pub type PageIndex = HashMap<String, Vec<(u32, u32)>>;
 
 impl Index {
     pub fn create(path: impl AsRef<Path>, schema: SearchSchema) -> Result<Self> {
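Note: the new type maps each word to a list of its (start, end) grapheme spans, since a word can occur on a page more than once. A minimal sketch of the serialized form (the "flooding" values mirror the placeholder that the old stub in search.rs used):

    // A one-entry PageIndex and its JSON form (illustrative values).
    let mut pindex: PageIndex = HashMap::new();
    pindex.insert("flooding".to_string(), vec![(2, 10)]);
    assert_eq!(serde_json::to_string(&pindex).unwrap(), r#"{"flooding":[[2,10]]}"#);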
@@ -254,8 +254,14 @@ impl Index {
         }
     }
 
-    pub fn page_index(&self, uuid: Uuid) -> Result<PageIndex> {
-        todo!()
+    pub fn page_index(&self, path: &str) -> Result<PageIndex> {
+        let mut path = PathBuf::from(path);
+        path.set_extension("pageindex");
+        let data_str = fs::read_to_string(path.to_string_lossy().to_string())
+            .map_err(|e| CreationError(e.to_string()))?;
+        let fast_results: PageIndex =
+            serde_json::from_str(&data_str).map_err(|e| CreationError(e.to_string()))?;
+        Ok(fast_results)
     }
 
     fn create_index(path: &PathBuf, schema: Schema) -> Result<TantivyIndex> {
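Note: page_index takes the path of an extracted page file and swaps its extension for .pageindex, so it reads back exactly what create_page_index (below) writes next to the page text. A hedged usage sketch — the path is hypothetical:

    // Hypothetical caller: load the sidecar index for one page.
    let pindex: PageIndex = index.page_index("pages/page_1.txt")?;
    if let Some(spans) = pindex.get("flooding") {
        println!("'flooding' occurs at grapheme spans {:?}", spans);
    }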
@@ -364,6 +370,7 @@ impl Index {
                 let page_body = std::fs::read_to_string(&page_path)
                     .map_err(|e| PdfParseError(e.to_string()))?;
                 self.add_page(dir_entry.path(), page_number, &page_path, &page_body)?;
+                Self::create_page_index(&mut page_path.clone(), &page_body)?;
             }
         }
@@ -392,6 +399,7 @@ impl Index {
             .map_err(|e| TxtParseError(e.to_string() + full_path.to_string_lossy().as_ref()))?;
         // Finally, add page
         self.add_page(dir_entry.path(), page_number, &page_path, &body)?;
+        Self::create_page_index(&mut page_path.clone(), &body)?;
         Ok(page_number)
     }
@@ -481,18 +489,38 @@ impl Index {
         }
     }
 
-    fn split_text_into_words(text: &str) -> Vec<String> {
-        // Define a regular expression to remove all non-alphanumeric characters except spaces
-        let re = Regex::new(r"[^\w\s]").unwrap();
-
-        // Remove newlines and special characters from the text
-        let cleaned_text = re.replace_all(text, "");
-
-        // Split the cleaned text into words and collect them into a vector
-        cleaned_text
-            .split_whitespace()
-            .map(|s| s.to_string())
-            .collect()
+    fn create_page_index(path: &mut PathBuf, body: &str) -> Result<()> {
+        // Create reversed index map
+        let pindex: PageIndex = Self::split_text_into_words(body)?;
+        path.set_extension("pageindex");
+        let json_str = serde_json::to_string(&pindex).map_err(|e| CreationError(e.to_string()))?;
+        std::fs::write(path, json_str).map_err(|e| CreationError(e.to_string()))?;
+        Ok(())
+    }
+
+    fn split_text_into_words(body: &str) -> Result<PageIndex> {
+        let mut pindex: PageIndex = HashMap::new();
+        let mut i = 0;
+        let graphemes: Vec<&str> = body.graphemes(true).collect();
+        while i < graphemes.len() {
+            let mut buffer: String = "".to_string();
+            let mut j = i;
+            while j < graphemes.len() {
+                if graphemes[j].chars().all(|c| c.is_alphanumeric()) {
+                    buffer += graphemes[j];
+                } else {
+                    pindex
+                        .entry(buffer.clone())
+                        .or_default()
+                        .push((i as u32, j as u32));
+                    i = j;
+                    break;
+                }
+                j += 1;
+            }
+            i += 1;
+        }
+        Ok(pindex)
     }
 }

[Review comment] SimonThormeyer (Collaborator), Sep 23, 2024, on fn split_text_into_words:
If the function creates and returns a PageIndex, this should probably be reflected in its name. I would also prefer another name for the argument, as it would work on any kind of text, not just the body of a PDF page.
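A quick trace of the grapheme walk, to make the offsets concrete (a sketch in the style of the dev tests; values derived by stepping through the loop by hand): each word is flushed when a non-alphanumeric grapheme follows it, and the tuple records its (start, end) grapheme positions.

    let pindex = Index::split_text_into_words("Hi du!").unwrap_or_default();
    assert_eq!(pindex.get("Hi"), Some(&vec![(0u32, 2u32)]));
    assert_eq!(pindex.get("du"), Some(&vec![(3u32, 5u32)]));

One consequence of the flush-on-delimiter design: a trailing word with no following delimiter is never pushed into the map (the existing test string ends in "!", so all three of its words survive).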

@@ -571,7 +599,9 @@ mod tests {
     #[test]
     fn test_split_text_into_words() {
         let text = "Hello*&%&^%, beautiful\n\rWörld!";
-        let result = Index::split_text_into_words(text);
-        assert_eq!(vec!["Hello", "beautiful", "Wörld"], result);
+        let result = Index::split_text_into_words(text).unwrap_or_default();
+        assert!(result.contains_key("Hello"));
+        assert!(result.contains_key("beautiful"));
+        assert!(result.contains_key("Wörld"));
     }
 }
135 changes: 91 additions & 44 deletions search/src/search.rs
@@ -5,14 +5,16 @@ use tantivy::schema::Value;
 use tantivy::{DocAddress, Snippet, SnippetGenerator, TantivyDocument};
 
 extern crate litt_index;
-use litt_index::index::Index;
+use litt_index::index::{Index, PageIndex};
 use litt_shared::search_schema::SearchSchema;
 
 use crate::LittSearchError::SearchError;
 use crate::Result;
 
 use levenshtein::levenshtein;
 
+const FUZZY_PREVIEW_NOT_FOUND: &str = "[fuzzy match] No preview. We're sry.";
+
 #[derive(Debug, Clone, Copy)]
 #[cfg_attr(test, derive(PartialEq))]
 pub struct SearchResult {
@@ -155,20 +157,27 @@ impl Search {
         )))?;
         let text = fs::read_to_string(path).map_err(|e| SearchError(e.to_string()))?;
 
-        let _ = match search_term {
-            SearchTerm::Fuzzy(term, distance) =>
-                return self.get_fuzzy_preview(term, distance, text),
-            SearchTerm::Exact(term) =>
-                return self.get_preview_from_query(term, text)
-        };
+        match search_term {
+            SearchTerm::Fuzzy(term, distance) => {
+                for t in term.split(" ").collect::<Vec<&str>>() {
+                    if let Ok(prev) = self.get_fuzzy_preview(path, t, distance, &text) {
+                        return Ok(prev);
+                    }
+                }
+                Ok(FUZZY_PREVIEW_NOT_FOUND.to_string())
+            }
+            SearchTerm::Exact(term) => self.get_preview_from_query(term, text),
+        }
     }
 
     fn get_preview_from_query(&self, term: &str, text: String) -> Result<String> {
         let searcher = self
             .index
             .searcher()
             .map_err(|e| SearchError(e.to_string()))?;
-        let query = self.index.query_parser()
+        let query = self
+            .index
+            .query_parser()
             .map_err(|e| SearchError(e.to_string()))?
             .parse_query(term)
             .map_err(|e| SearchError(e.to_string()))?;
@@ -181,37 +190,63 @@ impl Search {
         Ok(self.highlight(snippet))
     }
 
-    fn get_fuzzy_preview(&self, term: &str, distance: &u8, body: String) -> Result<String> {
-        let pindex: HashMap<&str, Vec<(u32, u32)>> = HashMap::from([
-            ("flooding", vec![(2, 10)])
-        ]);
-        let (matched_term, start, end) = self.get_fuzzy_match(term, distance, pindex);
+    fn get_fuzzy_preview(
+        &self,
+        path: &str,
+        term: &str,
+        distance: &u8,
+        body: &str,
+    ) -> Result<String> {
+        let pindex: PageIndex = self
+            .index
+            .page_index(path)
+            .map_err(|_| SearchError("".to_string()))?;
+        let (matched_term, start, end) = self
+            .get_fuzzy_match(term, distance, pindex)
+            .map_err(|_| SearchError("".to_string()))?;
         // Another safe way to get substrings using char_indices
-        let start = body.char_indices().nth(start as usize).unwrap_or((0, ' ')).0;
-        let end = body.char_indices().nth(end as usize).unwrap_or((body.len() - 1, ' ')).0;
-        let substring = &body[start..end];
-        Ok(substring.to_string().replace(&matched_term, &format!("**{}**", term)))
+        let start = body
+            .char_indices()
+            .nth(start.saturating_sub(20) as usize)
+            .unwrap_or((0, ' '))
+            .0;
+        let end = body
+            .char_indices()
+            .nth((end + 20) as usize)
+            .unwrap_or((body.len() - 1, ' '))
+            .0;
+        let substring = &format!("...{}...", &body[start..end]);
+        let substring = substring
+            .to_string()
+            .replace(&matched_term, &format!("**{}**", matched_term));
+        Ok(substring.replace('\n', " "))
     }

-    fn get_fuzzy_match(&self, term: &str, distance: &u8, pindex: HashMap<&str, Vec<(u32, u32)>>) -> (String, u32, u32) {
+    fn get_fuzzy_match(
+        &self,
+        term: &str,
+        distance: &u8,
+        pindex: PageIndex,
+    ) -> Result<(String, u32, u32)> {
         if pindex.contains_key(term) {
             let (start, end) = pindex.get(term).unwrap().first().unwrap();
-            return (term.to_string(), *start, *end);
+            Ok((term.to_string(), *start, *end))
         } else {
             let mut cur: (String, u32, u32) = ("".to_string(), 0, 0);
             let mut min_dist: usize = usize::MAX;
             for (word, matches) in pindex {
-                let dist: usize = levenshtein(term, word);
+                let dist: usize = levenshtein(term, &word);
+                let dist = if word.contains(term) { 1 } else { dist };
                 if dist < min_dist {
                     min_dist = dist;
                     let (start, end) = matches.first().unwrap_or(&(0, 0));
                     cur = (word.to_string(), *start, *end)
                 }
             }
             if min_dist as u8 <= *distance {
-                return cur;
+                Ok(cur)
             } else {
-                return ("".to_string(), 0, 0)
+                Err(SearchError("".to_string()))
             }
         }
     }
 }

[Resolved review thread] on fn get_fuzzy_match (marked as resolved).

[Review comment] SimonThormeyer (Collaborator), on the distance parameter:
If I get it correctly, this is the maximum allowed distance for a fuzzy match, right? If so, let's rename the argument to make the code more comprehensible.

[Reply] Owner (PR author):
I would stick with "distance", since it is the standard term related to the Levenshtein distance — what do you think?

[Review comment] SimonThormeyer (Collaborator), on the line overwriting dist:
Why do we overwrite the value in dist with 1 in this case?

[Reply] Owner (PR author):
The idea is that e.g. "Soledad" should be found when searching for "Sole". However, if you have a good reason for not doing so, I'd also be fine.

[Reply] Owner (PR author):
I did, however, change the syntax to a one-liner.
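To make the two distance paths concrete, a small sketch using the levenshtein crate imported above (example words taken from the tests):

    // "Soledud" vs "Soledad": one substitution, within the default max distance of 2.
    assert_eq!(levenshtein("Soledud", "Soledad"), 1);
    // Plain edit distance for "Sole" vs "Soledad" is 3 and would be rejected,
    // which is why the substring check forces dist = 1 for prefix hits like this.
    assert_eq!(levenshtein("Sole", "Soledad"), 3);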
@@ -317,44 +352,56 @@ mod tests {
     }
 
     fn test_fuzzy_search(search: &Search) {
-        let test_cases: HashMap<&str, Vec<u32>> = HashMap::from([
-            ("Hündin", vec![1]),
-            ("flooding", vec![2]),
-            ("river", vec![1, 2]),
-            ("branch", vec![2]),
-            ("branch Sole", vec![1, 2]),
+        let test_cases: HashMap<&str, Vec<(u32, &str)>> = HashMap::from([
+            ("Hello", vec![(1, "World"), (2, FUZZY_PREVIEW_NOT_FOUND)]),
+            ("Hündin", vec![(1, "Bär")]),
+            ("flooding", vec![(2, "winter’s")]),
+            ("river", vec![(1, "drops"), (2, "foothill")]), // search result
+            ("branch", vec![(2, "arch")]),
+            ("branch Sole", vec![(1, "Salinas River"), (2, "arch")]),
             // ("branch Sole", vec![2]), // Does not work. finds Soledad @ page 1
             // ("branch Sole", vec![1]), // Does not work. finds branches @ page 1
-            ("Soledad", vec![1]),
-            ("Soledud Salinos", vec![1]), // actual fuzzy
+            ("Soledad", vec![(1, "Salinas")]),
+            ("Soledud", vec![(1, "River")]),
+            ("Soledud Salinos", vec![(1, "the")]), // actual fuzzy
             // ("Sole AND Sali", vec![1]), // Does not work: searching for ['sole' 'and', 'sali']
         ]);
         // one-word search returning 1 result with 1 page
         for (search_term, pages) in &test_cases {
             println!("- [fuzzy] searching {}.", search_term);
             let t_search_term = &SearchTerm::Fuzzy(search_term.to_string(), 2);
-            let results = search .search(t_search_term, 0, 10) .unwrap();
+            let results = search.search(t_search_term, 0, 10).unwrap();
             if !pages.is_empty() {
                 assert!(results.contains_key(TEST_DOC_NAME));
                 let doc_results = results.get(TEST_DOC_NAME).unwrap();
                 assert_eq!(pages.len(), doc_results.len());
-                for page in pages {
+                for (page, _) in pages {
                     assert!(doc_results.iter().any(|result| result.page == *page));
                 }
-            } else {
-                assert!(!results.contains_key(TEST_DOC_NAME));
-            }
-            for (_, pages) in &results {
-                for page in pages {
-                    let preview = match search.get_preview(page, &t_search_term) {
+                for page in doc_results {
+                    println!(
+                        "Getting preview: {} id:{},{}",
+                        page.page, page.doc_id, page.segment_ord
+                    );
+                    let page_num: u32 = page.page;
+                    let preview_part = pages
+                        .iter()
+                        .find(|&&(first, _)| first == page_num)
+                        .map_or("pagenotfound", |&(_, part)| part);
+                    let preview = match search.get_preview(page, t_search_term) {
                         Ok(preview) => preview,
-                        Err(_) => "".to_string(),
+                        Err(_) => FUZZY_PREVIEW_NOT_FOUND.to_string(),
                     };
-                    println!("{}", preview);
-                    assert!(preview.contains(search_term));
+                    println!(
+                        "Found preview \"{}\" should contain: {}",
+                        preview, preview_part
+                    );
+                    assert!(preview.contains(preview_part));
+                    println!("success");
                 }
+            } else {
+                assert!(!results.contains_key(TEST_DOC_NAME));
             }
         }
     }
