From 142843bcef1634248ecc03d1889da98ceb6ad447 Mon Sep 17 00:00:00 2001 From: fux Date: Mon, 19 Aug 2024 21:56:00 +0200 Subject: [PATCH 01/11] Workin on #70 - WIP page-indexing --- index/Cargo.toml | 1 + index/src/index.rs | 25 +++++++++++++++++++++++++ search/src/search.rs | 16 +++++++++++++--- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/index/Cargo.toml b/index/Cargo.toml index 19412c6..5d7dac2 100644 --- a/index/Cargo.toml +++ b/index/Cargo.toml @@ -13,6 +13,7 @@ thiserror = { workspace = true } walkdir = "2.3.3" litt_shared = { path = "../shared" } rayon = "1.8.0" +regex = "^1.0" [dev-dependencies] once_cell = "1.17.1" diff --git a/index/src/index.rs b/index/src/index.rs index 0a63e17..2a0ed27 100644 --- a/index/src/index.rs +++ b/index/src/index.rs @@ -6,6 +6,7 @@ use crate::Result; use litt_shared::search_schema::SearchSchema; use litt_shared::LITT_DIRECTORY_NAME; use rayon::prelude::*; +use regex::Regex; use std::collections::HashMap; use std::convert::AsRef; use std::fs::{create_dir_all, File}; @@ -473,6 +474,21 @@ impl Index { Ok(false) } } + + + fn split_text_into_words(text: &str) -> Vec { + // Define a regular expression to remove all non-alphanumeric characters except spaces + let re = Regex::new(r"[^\w\s]").unwrap(); + + // Remove newlines and special characters from the text + let cleaned_text = re.replace_all(text, ""); + + // Split the cleaned text into words and collect them into a vector + cleaned_text + .split_whitespace() + .map(|s| s.to_string()) + .collect() + } } #[cfg(test)] @@ -546,4 +562,13 @@ mod tests { .is_dir()) }); } + + #[test] + #[serial] + fn test_split_text_into_words() { + run_test(|| { + + }); + } + } diff --git a/search/src/search.rs b/search/src/search.rs index b3f000b..be1b24e 100644 --- a/search/src/search.rs +++ b/search/src/search.rs @@ -286,9 +286,8 @@ mod tests { // one-word search returning 1 result with 1 page for (search_term, pages) in &test_cases { println!("- [fuzzy] searching {}.", search_term); 
- let results = search - .search(&SearchTerm::Fuzzy(search_term.to_string(), 2), 0, 10) - .unwrap(); + let t_search_term = &SearchTerm::Fuzzy(search_term.to_string(), 2); + let results = search .search(t_search_term, 0, 10) .unwrap(); if !pages.is_empty() { assert!(results.contains_key(TEST_DOC_NAME)); let doc_results = results.get(TEST_DOC_NAME).unwrap(); @@ -299,6 +298,17 @@ mod tests { } else { assert!(!results.contains_key(TEST_DOC_NAME)); } + for (_, pages) in &results { + for page in pages { + let preview = match search.get_preview(page, &t_search_term) { + Ok(preview) => preview, + Err(_) => "".to_string(), + }; + println!("{}", preview); + assert!(preview.contains(search_term)); + } + } + } } From ea09df3cf316cf7b157e1e4bbac316d0dc2a290b Mon Sep 17 00:00:00 2001 From: SimonThormeyer Date: Mon, 19 Aug 2024 22:51:58 +0200 Subject: [PATCH 02/11] feat: add index function --- index/src/index.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/index/src/index.rs b/index/src/index.rs index 2a0ed27..6be26b5 100644 --- a/index/src/index.rs +++ b/index/src/index.rs @@ -44,6 +44,8 @@ pub enum Index { }, } +pub type PageIndex = HashMap; + impl Index { pub fn create(path: impl AsRef, schema: SearchSchema) -> Result { let documents_path = PathBuf::from(path.as_ref()); @@ -252,6 +254,10 @@ impl Index { } } + pub fn page_index(&self, uuid: Uuid) -> Result { + todo!() + } + fn create_index(path: &PathBuf, schema: Schema) -> Result { TantivyIndex::create_in_dir(path, schema).map_err(|e| CreationError(e.to_string())) } @@ -475,14 +481,13 @@ impl Index { } } - fn split_text_into_words(text: &str) -> Vec { // Define a regular expression to remove all non-alphanumeric characters except spaces let re = Regex::new(r"[^\w\s]").unwrap(); - + // Remove newlines and special characters from the text let cleaned_text = re.replace_all(text, ""); - + // Split the cleaned text into words and collect them into a vector cleaned_text .split_whitespace() 
@@ -564,11 +569,9 @@ mod tests { } #[test] - #[serial] fn test_split_text_into_words() { - run_test(|| { - - }); + let text = "Hello*&%&^%, beautiful\n\rWörld!"; + let result = Index::split_text_into_words(text); + assert_eq!(vec!["Hello", "beautiful", "Wörld"], result); } - } From d824f8dfb5968534bfd8ecd3d9a04b0558ffcc4c Mon Sep 17 00:00:00 2001 From: fux Date: Mon, 19 Aug 2024 23:32:14 +0200 Subject: [PATCH 03/11] Working on #70 -- Adds finding preview with levensthtein distance --- search/Cargo.toml | 2 +- search/src/search.rs | 70 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 13 deletions(-) diff --git a/search/Cargo.toml b/search/Cargo.toml index 0f93c2b..b1865fc 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -11,4 +11,4 @@ tantivy = { workspace = true } thiserror = { workspace = true } litt_shared = { path = "../shared" } litt_index = { path = "../index" } - +levenshtein = "1.0.5" diff --git a/search/src/search.rs b/search/src/search.rs index be1b24e..a6f7666 100644 --- a/search/src/search.rs +++ b/search/src/search.rs @@ -11,6 +11,8 @@ use litt_shared::search_schema::SearchSchema; use crate::LittSearchError::SearchError; use crate::Result; +use levenshtein::levenshtein; + #[derive(Debug, Clone, Copy)] #[cfg_attr(test, derive(PartialEq))] pub struct SearchResult { @@ -134,17 +136,6 @@ impl Search { .index .searcher() .map_err(|e| SearchError(e.to_string()))?; - let (query_parser, term) = match search_term { - SearchTerm::Fuzzy(_, _) => return Ok("[fuzzy match] No preview. We're sry.".into()), - SearchTerm::Exact(term) => (self.index.query_parser(), term), - }; - let query = query_parser - .map_err(|e| SearchError(e.to_string()))? 
- .parse_query(term) - .map_err(|e| SearchError(e.to_string()))?; - let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, self.schema.body) - .map_err(|e| SearchError(e.to_string()))?; - snippet_generator.set_max_num_chars(70); let retrieved_doc: TantivyDocument = searcher .doc(DocAddress { segment_ord: (search_result.segment_ord), @@ -164,12 +155,67 @@ impl Search { )))?; let text = fs::read_to_string(path).map_err(|e| SearchError(e.to_string()))?; - // Generate snippet + let _ = match search_term { + SearchTerm::Fuzzy(term, distance) => + return self.get_fuzzy_preview(term, distance, text), + SearchTerm::Exact(term) => + return self.get_preview_from_query(term, text) + }; + } + + fn get_preview_from_query(&self, term: &str, text: String) -> Result { + let searcher = self + .index + .searcher() + .map_err(|e| SearchError(e.to_string()))?; + let query = self.index.query_parser() + .map_err(|e| SearchError(e.to_string()))? + .parse_query(term) + .map_err(|e| SearchError(e.to_string()))?; + let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, self.schema.body) + .map_err(|e| SearchError(e.to_string()))?; + snippet_generator.set_max_num_chars(70); + let snippet = snippet_generator.snippet(&text); // let snippet = snippet_generator.snippet_from_doc(&retrieved_doc); Ok(self.highlight(snippet)) } + fn get_fuzzy_preview(&self, term: &str, distance: &u8, body: String) -> Result { + let pindex: HashMap<&str, Vec<(u32, u32)>> = HashMap::from([ + ("flooding", vec![(2, 10)]) + ]); + let (matched_term, start, end) = self.get_fuzzy_match(term, distance, pindex); + // Another safe way to get substrings using char_indices + let start = body.char_indices().nth(start as usize).unwrap_or((0, ' ')).0; + let end = body.char_indices().nth(end as usize).unwrap_or((body.len()-1, ' ')).0; + let substring = &body[start..end]; + Ok(substring.to_string().replace(&matched_term, &format!("**{}**", term))) + } + + fn get_fuzzy_match(&self, term: &str, 
distance: &u8, pindex: HashMap<&str, Vec<(u32, u32)>>) -> (String, u32, u32) { + if pindex.contains_key(term) { + let (start, end) = pindex.get(term).unwrap().first().unwrap(); + return (term.to_string(), *start, *end); + } else { + let mut cur: (String, u32, u32) = ("".to_string(), 0, 0); + let mut min_dist: usize = usize::MAX; + for (word, matches) in pindex { + let dist: usize = levenshtein(term, word); + if dist < min_dist { + min_dist = dist; + let (start, end) = matches.first().unwrap_or(&(0, 0)); + cur = (word.to_string(), *start, *end) + } + } + if min_dist as u8 <= *distance { + return cur; + } else { + return ("".to_string(), 0, 0) + } + } + } + fn highlight(&self, snippet: Snippet) -> String { let mut result = String::new(); let mut start_from = 0; From f85eacfbb502184a3be5c57039533f1eb705c54e Mon Sep 17 00:00:00 2001 From: fux Date: Tue, 20 Aug 2024 04:10:04 +0200 Subject: [PATCH 04/11] Fixes #70 - implements preview for fuzzy search --- index/Cargo.toml | 2 +- index/src/index.rs | 66 +++++++++++++++------ search/src/search.rs | 135 +++++++++++++++++++++++++++++-------------- 3 files changed, 140 insertions(+), 63 deletions(-) diff --git a/index/Cargo.toml b/index/Cargo.toml index 5d7dac2..8c76f3d 100644 --- a/index/Cargo.toml +++ b/index/Cargo.toml @@ -13,7 +13,7 @@ thiserror = { workspace = true } walkdir = "2.3.3" litt_shared = { path = "../shared" } rayon = "1.8.0" -regex = "^1.0" +unicode-segmentation = "1.9.0" [dev-dependencies] once_cell = "1.17.1" diff --git a/index/src/index.rs b/index/src/index.rs index 6be26b5..1dbc681 100644 --- a/index/src/index.rs +++ b/index/src/index.rs @@ -6,10 +6,9 @@ use crate::Result; use litt_shared::search_schema::SearchSchema; use litt_shared::LITT_DIRECTORY_NAME; use rayon::prelude::*; -use regex::Regex; use std::collections::HashMap; use std::convert::AsRef; -use std::fs::{create_dir_all, File}; +use std::fs::{self, create_dir_all, File}; use std::io::{self, Read}; use std::path::{Path, PathBuf}; use 
std::process::Command; @@ -18,6 +17,7 @@ use std::time::SystemTime; use tantivy::query::QueryParser; use tantivy::schema::{Schema, TantivyDocument}; use tantivy::{Index as TantivyIndex, IndexReader, IndexWriter, ReloadPolicy, Searcher}; +use unicode_segmentation::UnicodeSegmentation; use uuid::Uuid; use walkdir::{DirEntry, WalkDir}; @@ -44,7 +44,7 @@ pub enum Index { }, } -pub type PageIndex = HashMap; +pub type PageIndex = HashMap>; impl Index { pub fn create(path: impl AsRef, schema: SearchSchema) -> Result { @@ -254,8 +254,14 @@ impl Index { } } - pub fn page_index(&self, uuid: Uuid) -> Result { - todo!() + pub fn page_index(&self, path: &str) -> Result { + let mut path = PathBuf::from(path); + path.set_extension("pageindex"); + let data_str = fs::read_to_string(path.to_string_lossy().to_string()) + .map_err(|e| CreationError(e.to_string()))?; + let fast_results: PageIndex = + serde_json::from_str(&data_str).map_err(|e| CreationError(e.to_string()))?; + Ok(fast_results) } fn create_index(path: &PathBuf, schema: Schema) -> Result { @@ -364,6 +370,7 @@ impl Index { let page_body = std::fs::read_to_string(&page_path) .map_err(|e| PdfParseError(e.to_string()))?; self.add_page(dir_entry.path(), page_number, &page_path, &page_body)?; + Self::create_page_index(&mut page_path.clone(), &page_body)?; } } @@ -392,6 +399,7 @@ impl Index { .map_err(|e| TxtParseError(e.to_string() + full_path.to_string_lossy().as_ref()))?; // Finally, add page self.add_page(dir_entry.path(), page_number, &page_path, &body)?; + Self::create_page_index(&mut page_path.clone(), &body)?; Ok(page_number) } @@ -481,18 +489,38 @@ impl Index { } } - fn split_text_into_words(text: &str) -> Vec { - // Define a regular expression to remove all non-alphanumeric characters except spaces - let re = Regex::new(r"[^\w\s]").unwrap(); - - // Remove newlines and special characters from the text - let cleaned_text = re.replace_all(text, ""); + fn create_page_index(path: &mut PathBuf, body: &str) -> Result<()> { + 
// Create reversed index map + let pindex: PageIndex = Self::split_text_into_words(body)?; + path.set_extension("pageindex"); + let json_str = serde_json::to_string(&pindex).map_err(|e| CreationError(e.to_string()))?; + std::fs::write(path, json_str).map_err(|e| CreationError(e.to_string()))?; + Ok(()) + } - // Split the cleaned text into words and collect them into a vector - cleaned_text - .split_whitespace() - .map(|s| s.to_string()) - .collect() + fn split_text_into_words(body: &str) -> Result { + let mut pindex: PageIndex = HashMap::new(); + let mut i = 0; + let graphemes: Vec<&str> = body.graphemes(true).collect(); + while i < graphemes.len() { + let mut buffer: String = "".to_string(); + let mut j = i; + while j < graphemes.len() { + if graphemes[j].chars().all(|c| c.is_alphanumeric()) { + buffer += graphemes[j]; + } else { + pindex + .entry(buffer.clone()) + .or_default() + .push((i as u32, j as u32)); + i = j; + break; + } + j += 1; + } + i += 1; + } + Ok(pindex) } } @@ -571,7 +599,9 @@ mod tests { #[test] fn test_split_text_into_words() { let text = "Hello*&%&^%, beautiful\n\rWörld!"; - let result = Index::split_text_into_words(text); - assert_eq!(vec!["Hello", "beautiful", "Wörld"], result); + let result = Index::split_text_into_words(text).unwrap_or_default(); + assert!(result.contains_key("Hello")); + assert!(result.contains_key("beautiful")); + assert!(result.contains_key("Wörld")); } } diff --git a/search/src/search.rs b/search/src/search.rs index a6f7666..569611d 100644 --- a/search/src/search.rs +++ b/search/src/search.rs @@ -5,7 +5,7 @@ use tantivy::schema::Value; use tantivy::{DocAddress, Snippet, SnippetGenerator, TantivyDocument}; extern crate litt_index; -use litt_index::index::Index; +use litt_index::index::{Index, PageIndex}; use litt_shared::search_schema::SearchSchema; use crate::LittSearchError::SearchError; @@ -13,6 +13,8 @@ use crate::Result; use levenshtein::levenshtein; +const FUZZY_PREVIEW_NOT_FOUND: &str = "[fuzzy match] No preview. 
We're sry."; + #[derive(Debug, Clone, Copy)] #[cfg_attr(test, derive(PartialEq))] pub struct SearchResult { @@ -155,12 +157,17 @@ impl Search { )))?; let text = fs::read_to_string(path).map_err(|e| SearchError(e.to_string()))?; - let _ = match search_term { - SearchTerm::Fuzzy(term, distance) => - return self.get_fuzzy_preview(term, distance, text), - SearchTerm::Exact(term) => - return self.get_preview_from_query(term, text) - }; + match search_term { + SearchTerm::Fuzzy(term, distance) => { + for t in term.split(" ").collect::>() { + if let Ok(prev) = self.get_fuzzy_preview(path, t, distance, &text) { + return Ok(prev); + } + } + Ok(FUZZY_PREVIEW_NOT_FOUND.to_string()) + } + SearchTerm::Exact(term) => self.get_preview_from_query(term, text), + } } fn get_preview_from_query(&self, term: &str, text: String) -> Result { @@ -168,7 +175,9 @@ impl Search { .index .searcher() .map_err(|e| SearchError(e.to_string()))?; - let query = self.index.query_parser() + let query = self + .index + .query_parser() .map_err(|e| SearchError(e.to_string()))? 
.parse_query(term) .map_err(|e| SearchError(e.to_string()))?; @@ -181,37 +190,63 @@ impl Search { Ok(self.highlight(snippet)) } - fn get_fuzzy_preview(&self, term: &str, distance: &u8, body: String) -> Result { - let pindex: HashMap<&str, Vec<(u32, u32)>> = HashMap::from([ - ("flooding", vec![(2, 10)]) - ]); - let (matched_term, start, end) = self.get_fuzzy_match(term, distance, pindex); + fn get_fuzzy_preview( + &self, + path: &str, + term: &str, + distance: &u8, + body: &str, + ) -> Result { + let pindex: PageIndex = self + .index + .page_index(path) + .map_err(|_| SearchError("".to_string()))?; + let (matched_term, start, end) = self + .get_fuzzy_match(term, distance, pindex) + .map_err(|_| SearchError("".to_string()))?; // Another safe way to get substrings using char_indices - let start = body.char_indices().nth(start as usize).unwrap_or((0, ' ')).0; - let end = body.char_indices().nth(end as usize).unwrap_or((body.len()-1, ' ')).0; - let substring = &body[start..end]; - Ok(substring.to_string().replace(&matched_term, &format!("**{}**", term))) + let start = body + .char_indices() + .nth(start.saturating_sub(20) as usize) + .unwrap_or((0, ' ')) + .0; + let end = body + .char_indices() + .nth((end + 20) as usize) + .unwrap_or((body.len() - 1, ' ')) + .0; + let substring = &format!("...{}...", &body[start..end]); + let substring = substring + .to_string() + .replace(&matched_term, &format!("**{}**", matched_term)); + Ok(substring.replace('\n', " ")) } - fn get_fuzzy_match(&self, term: &str, distance: &u8, pindex: HashMap<&str, Vec<(u32, u32)>>) -> (String, u32, u32) { + fn get_fuzzy_match( + &self, + term: &str, + distance: &u8, + pindex: PageIndex, + ) -> Result<(String, u32, u32)> { if pindex.contains_key(term) { let (start, end) = pindex.get(term).unwrap().first().unwrap(); - return (term.to_string(), *start, *end); + Ok((term.to_string(), *start, *end)) } else { let mut cur: (String, u32, u32) = ("".to_string(), 0, 0); let mut min_dist: usize = usize::MAX; 
for (word, matches) in pindex { - let dist: usize = levenshtein(term, word); + let dist: usize = levenshtein(term, &word); + let dist = if word.contains(term) { 1 } else { dist }; if dist < min_dist { - min_dist = dist; + min_dist = dist; let (start, end) = matches.first().unwrap_or(&(0, 0)); cur = (word.to_string(), *start, *end) } } if min_dist as u8 <= *distance { - return cur; + Ok(cur) } else { - return ("".to_string(), 0, 0) + Err(SearchError("".to_string())) } } } @@ -317,44 +352,56 @@ mod tests { } fn test_fuzzy_search(search: &Search) { - let test_cases: HashMap<&str, Vec> = HashMap::from([ - ("Hündin", vec![1]), - ("flooding", vec![2]), - ("river", vec![1, 2]), - ("branch", vec![2]), - ("branch Sole", vec![1, 2]), + let test_cases: HashMap<&str, Vec<(u32, &str)>> = HashMap::from([ + ("Hello", vec![(1, "World"), (2, FUZZY_PREVIEW_NOT_FOUND)]), + ("Hündin", vec![(1, "Bär")]), + ("flooding", vec![(2, "winter’s")]), + ("river", vec![(1, "drops"), (2, "foothill")]), // search result + ("branch", vec![(2, "arch")]), + ("branch Sole", vec![(1, "Salinas River"), (2, "arch")]), // ("branch Sole", vec![2]), // Does not work. finds Soledad @ page 1 // ("branch Sole", vec![1]), // Does not work. 
finds branches @ page 1 - ("Soledad", vec![1]), - ("Soledud Salinos", vec![1]), // actual fuzzy - // ("Sole AND Sali", vec![1]), // Does not work: searching for ['sole' 'and', 'sali'] + ("Soledad", vec![(1, "Salinas")]), + ("Soledud", vec![(1, "River")]), + ("Soledud Salinos", vec![(1, "the")]), // actual fuzzy + // ("Sole AND Sali", vec![1]), // Does not work: searching for ['sole' 'and', 'sali'] ]); // one-word search returning 1 result with 1 page for (search_term, pages) in &test_cases { println!("- [fuzzy] searching {}.", search_term); let t_search_term = &SearchTerm::Fuzzy(search_term.to_string(), 2); - let results = search .search(t_search_term, 0, 10) .unwrap(); + let results = search.search(t_search_term, 0, 10).unwrap(); if !pages.is_empty() { assert!(results.contains_key(TEST_DOC_NAME)); let doc_results = results.get(TEST_DOC_NAME).unwrap(); assert_eq!(pages.len(), doc_results.len()); - for page in pages { + for (page, _) in pages { assert!(doc_results.iter().any(|result| result.page == *page)); } - } else { - assert!(!results.contains_key(TEST_DOC_NAME)); - } - for (_, pages) in &results { - for page in pages { - let preview = match search.get_preview(page, &t_search_term) { + for page in doc_results { + println!( + "Getting preview: {} id:{},{}", + page.page, page.doc_id, page.segment_ord + ); + let page_num: u32 = page.page; + let preview_part = pages + .iter() + .find(|&&(first, _)| first == page_num) + .map_or("pagenotfound", |&(_, part)| part); + let preview = match search.get_preview(page, t_search_term) { Ok(preview) => preview, - Err(_) => "".to_string(), + Err(_) => FUZZY_PREVIEW_NOT_FOUND.to_string(), }; - println!("{}", preview); - assert!(preview.contains(search_term)); + println!( + "Found preview \"{}\" should contain: {}", + preview, preview_part + ); + assert!(preview.contains(preview_part)); + println!("success"); } + } else { + assert!(!results.contains_key(TEST_DOC_NAME)); } - } } From 1b95191210ee5b5c54c27ddefa9e9da7225f8d4b Mon Sep 
17 00:00:00 2001 From: fux Date: Tue, 3 Sep 2024 03:35:51 +0200 Subject: [PATCH 05/11] Updates README to inform user about possible jump to the wrong page (zathura) --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index bcf7561..1ce334e 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,23 @@ Use `litt ` to open a document (num refers to the number in brackets, f.e. ![simple example](images/simple_example.png) +**NOTE (open on wrong page):** Possibly zathura (or whatever pdf-reader you're using) will open a +result on the correct page, but then search for the term and (mostly in the case +of fuzzy matching) not find the term on that page but on another. In this case +it might appear like zathura sent you to the wrong page or no result was found on +the page you wanted to open. For zathura: simply enter `:` to go to +the page where the result was found. Possibly the searched term was not found by +zathura since it breaks line, i.e.: +``` +my- +stifiziert +``` +Try to search for a substring to then find the term on the page.
+``` +/my +``` + + ### Exact matching You can search for multiple words, the following will give the same result ``` From e4d0a2770bf5ee75d4f1ca46184d72fb98a448e1 Mon Sep 17 00:00:00 2001 From: fux Date: Tue, 3 Sep 2024 04:31:38 +0200 Subject: [PATCH 06/11] Sovles issue with zathura not finding fuzzy matches - get_preview now also returns the found match and an empty string if no match was found this way, zathura can search for the matched string or not at all (in case of no match found) - Adds tests for mystifiziert, but preview-fuzzy-search does not find Mystifizierung (dist=4) and mystifizierende (dist=5) --- litt/src/main.rs | 23 +++++------------------ litt/tests/tests.rs | 2 +- resources/test.pdf | Bin 382804 -> 392638 bytes search/src/search.rs | 39 +++++++++++++++++++++++++++++---------- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/litt/src/main.rs b/litt/src/main.rs index 2e6f66b..19cf526 100644 --- a/litt/src/main.rs +++ b/litt/src/main.rs @@ -27,18 +27,6 @@ enum LittError { General(String), } -fn get_first_term(query: &str) -> String { - let parts = query.split(' ').collect::>(); - if let Some(first_str) = parts.first() { - if let Some(stripped) = first_str.strip_prefix('\"') { - return stripped.to_string(); - } - first_str.to_string() - } else { - "".to_string() - } -} - fn open_pdf(path: String, page: u32, term: String) -> Result<(), LittError> { let mut cmd = std::process::Command::new("zathura"); cmd.arg(&path) @@ -293,7 +281,6 @@ fn main() -> Result<(), LittError> { }; println!("Found results in {} document(s):", results.len()); let mut fast_store_results: HashMap = HashMap::new(); - let first_query_term = get_first_term(&cli.term); let mut counter = 0; let mut res_counter = 1; for (title, pages) in &results { @@ -306,18 +293,18 @@ fn main() -> Result<(), LittError> { let index_path = index_path.join(title); println!(" ({})", index_path.to_string_lossy().italic()); for page in pages { + let (preview, matched_term) = match 
search.get_preview(page, &search_term) { + Ok(preview) => preview, + Err(e) => return Err(LittError::General(e.to_string())), + }; fast_store_results.insert( res_counter, ( index_path.to_string_lossy().to_string(), page.page, - first_query_term.clone(), + matched_term, ), ); - let preview = match search.get_preview(page, &search_term) { - Ok(preview) => preview, - Err(e) => return Err(LittError::General(e.to_string())), - }; println!( " - [{}] p.{}: \"{}\", (score: {})", res_counter, diff --git a/litt/tests/tests.rs b/litt/tests/tests.rs index fe3ebc3..0e84a30 100644 --- a/litt/tests/tests.rs +++ b/litt/tests/tests.rs @@ -30,7 +30,7 @@ fn test_index_and_search() { for (title, pages) in &results { assert_eq!(title, TEST_FILE_NAME); for search_result in pages { - let preview = search.get_preview(search_result, &searched_word).unwrap(); + let (preview, _) = search.get_preview(search_result, &searched_word).unwrap(); assert!(!preview.is_empty()); assert!( preview diff --git a/resources/test.pdf b/resources/test.pdf index d55df5a73078f0fb9e3794225eebaab52c7e608b..f16c6f0f4f09eab7af33b87737f8ad17189fe78e 100644 GIT binary patch delta 7612 zcmeHMTWl0n7%sxTY%E4Vs({G>!==J*&vnj>AcAu70);>$F)T6Nc9x5sC@+R6#!KP@#urKYC@qj^FhqS&BJqYK3K9SRpV^)5(w1FXzz{cd&YAho z{O3QH`M&Qz47}g;`GuwzddKzN(mTF)!pE_;lXJDpILoECHO4n;3DtGX2r4$Q#<-vD zB$E+^?~cuquIG7@Y1d0i?kFSRpJqn7NmpCpO3fX~jCPWq^&G{aomW`czwALFlbo}* zOe)*VT2`}p%kGmL8haetaHaKpF6VMSw}B1&D;(^yI~`Dg0H}j75Hw;V?7*=> zhvj>4Kn@{D8@@-KRbM<$a=q>bw*v}iX8&)4ZhM@G1&Bi$jnmw6u6>!B{cy=QAKyR5LVFP^Ax z)eaA|-eTnb<1?BJZY|nMix#!4^0S+JwlIgg#rA>N^yx~;q^GTP!X0Fj)+%W|q1@t| z1F=VMr3Qu>G!%axi0M4mSlfK^`KLFi zldkf!?O3A#spBYAYC?^(*km(Y@bwyloYSdE_c;DtJ)c&Nt=KBL;+a*akX zC<+)qh7m+T6G*-ibw*KF-xMP)LU(FnDac@XiWiT^W=&QVTWtN-y*@xuQJjcPOMdvz zJN@Rh_dc2bWvV&1WWx_jn%O4SvM!hFVag+{YgyanKg|?Cn5T-nPsHY|0az4>gbJH( zU>gO~OkzFUyvne}i&=LMd>uO);|=f|cj?l&hSw`nbG zV|@1P%ODXQSk=K|XPC zI1`nF47n{26-#kk)Xqh#evu(C7L7)(xl2$^~UjJ9yx+=J3{*F z$&>J95nDPy?Zdh(@c?KiKGWTquZ-dNhn$CT7Ae5>QXUR 
z5>z<<6#bPIRUp$yU{!CK@`Tkbn(};*Y6>?)qz|d5D7I*k$S`_c1>>jTk1XnL2qcpb_G}h(CA{9z1FF##N$t3v3p&oR%qSx7?+>1`#G*EoTP2OgLtY_Lg|u4RzV zwnAnKNv@$C5*(!%+!*U%5i95&->G7Em}esQsY;oo5pcNT$fOijSqn*lTcUNa1r{05 zIyRE^(Ru^{gmG10dX!s{f%uZXe!8WUH9~|9L@POjMS{2J<+=fKNPMJ@31X@Mkq8UW z>i!VcQcBq}I1`FRLDDFzoKT?2AXzM*dcpby$)Xp4Qr8{*GtZgszM~)gezxiM-7gIb zN(deVN{_>CfLzf)hB!1R!MWOyx#Caf36#z}N5$ENLGc6?zr08g<--|TMap=M;HKd9 zuNVgqzq#hxQ zuxPHv8510}-6S|OdSz6TFv^dtjH=LuV5HH zr74w7H6PluqV>p}(+f|&b?%#^!(tVp2w~M~fGNN#0xYj?74_>r!lpl-Y`7Ct)SpKY zqGo42ZalXvqlo!R@s6P+ssW?4xk378Y%vNnPZ_rWq!6d-6j+hI3J<0#^^)Y-YXN-M z1NXEU5%<7c+IZn9t#Q=Nq_6%PxJSw_5|E?;3H-)J{w+wse{%)rHrEVcQ3KAL?C{H= zwa8DzrY8>A?hjr$J^r`P9$U%ZebxBG;VL0tHLil!db88>eT1ujooz?}u3kJsxH{33 J$`xf?{TC|)M_>Q| delta 19 acmdn@Tl~s8@rD-07N!>F7M3lnA=d#{!UyO8 diff --git a/search/src/search.rs b/search/src/search.rs index 569611d..b4fc66f 100644 --- a/search/src/search.rs +++ b/search/src/search.rs @@ -45,6 +45,18 @@ pub enum SearchTerm { Exact(String), } +fn get_first_term(query: &str) -> String { + let parts = query.split(' ').collect::>(); + if let Some(first_str) = parts.first() { + if let Some(stripped) = first_str.strip_prefix('\"') { + return stripped.to_string(); + } + first_str.to_string() + } else { + "".to_string() + } +} + impl Search { pub fn new(index: Index, schema: SearchSchema) -> Self { Self { index, schema } @@ -132,7 +144,7 @@ impl Search { &self, search_result: &SearchResult, search_term: &SearchTerm, - ) -> Result { + ) -> Result<(String, String)> { // Prepare creating snippet. 
let searcher = self .index @@ -160,17 +172,19 @@ impl Search { match search_term { SearchTerm::Fuzzy(term, distance) => { for t in term.split(" ").collect::>() { - if let Ok(prev) = self.get_fuzzy_preview(path, t, distance, &text) { - return Ok(prev); + if let Ok((prev, matched_term)) = self.get_fuzzy_preview(path, t, distance, &text) { + return Ok((prev, matched_term.to_string())); } } - Ok(FUZZY_PREVIEW_NOT_FOUND.to_string()) + Ok((FUZZY_PREVIEW_NOT_FOUND.to_string(), "".to_string())) // return empty string so + // that zathura does not + // search } - SearchTerm::Exact(term) => self.get_preview_from_query(term, text), + SearchTerm::Exact(term) => self.get_preview_from_query(term, text) } } - fn get_preview_from_query(&self, term: &str, text: String) -> Result { + fn get_preview_from_query(&self, term: &str, text: String) -> Result<(String, String)> { let searcher = self .index .searcher() @@ -187,7 +201,7 @@ impl Search { let snippet = snippet_generator.snippet(&text); // let snippet = snippet_generator.snippet_from_doc(&retrieved_doc); - Ok(self.highlight(snippet)) + Ok((self.highlight(snippet), get_first_term(term))) } fn get_fuzzy_preview( @@ -196,7 +210,7 @@ impl Search { term: &str, distance: &u8, body: &str, - ) -> Result { + ) -> Result<(String, String)> { let pindex: PageIndex = self .index .page_index(path) @@ -219,7 +233,7 @@ impl Search { let substring = substring .to_string() .replace(&matched_term, &format!("**{}**", matched_term)); - Ok(substring.replace('\n', " ")) + Ok((substring.replace('\n', " "), matched_term)) } fn get_fuzzy_match( @@ -237,6 +251,7 @@ impl Search { for (word, matches) in pindex { let dist: usize = levenshtein(term, &word); let dist = if word.contains(term) { 1 } else { dist }; + println!("{} ~ {} = {}", term, word, dist); if dist < min_dist { min_dist = dist; let (start, end) = matches.first().unwrap_or(&(0, 0)); @@ -266,6 +281,7 @@ impl Search { result.push_str(&snippet.fragment()[start_from..]); result.replace('\n', " ") } 
+ } #[cfg(test)] @@ -331,6 +347,8 @@ mod tests { ("\"limbs branches\"", vec![]), ("\"limbs branches\"~1", vec![2]), ("\"of Sole\"*", vec![1]), + ("Mystifizierung", vec![1, 2]), + ("Mystifizierungen", vec![1]), ]); // one-word search returning 1 result with 1 page for (search_term, pages) in &test_cases { @@ -365,6 +383,7 @@ mod tests { ("Soledud", vec![(1, "River")]), ("Soledud Salinos", vec![(1, "the")]), // actual fuzzy // ("Sole AND Sali", vec![1]), // Does not work: searching for ['sole' 'and', 'sali'] + ("mystifiziert", vec![(1, "Mystifizierung"), (2, "No preview")]), ]); // one-word search returning 1 result with 1 page for (search_term, pages) in &test_cases { @@ -389,7 +408,7 @@ mod tests { .find(|&&(first, _)| first == page_num) .map_or("pagenotfound", |&(_, part)| part); let preview = match search.get_preview(page, t_search_term) { - Ok(preview) => preview, + Ok((preview, _)) => preview, Err(_) => FUZZY_PREVIEW_NOT_FOUND.to_string(), }; println!( From 44fef5bb62edff72a70abc1ab51a34d041777876 Mon Sep 17 00:00:00 2001 From: fux Date: Thu, 5 Sep 2024 13:12:16 +0200 Subject: [PATCH 07/11] Fixes formatting --- search/src/search.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/search/src/search.rs b/search/src/search.rs index b4fc66f..ca40060 100644 --- a/search/src/search.rs +++ b/search/src/search.rs @@ -172,7 +172,9 @@ impl Search { match search_term { SearchTerm::Fuzzy(term, distance) => { for t in term.split(" ").collect::>() { - if let Ok((prev, matched_term)) = self.get_fuzzy_preview(path, t, distance, &text) { + if let Ok((prev, matched_term)) = + self.get_fuzzy_preview(path, t, distance, &text) + { return Ok((prev, matched_term.to_string())); } } @@ -180,7 +182,7 @@ impl Search { // that zathura does not // search } - SearchTerm::Exact(term) => self.get_preview_from_query(term, text) + SearchTerm::Exact(term) => self.get_preview_from_query(term, text), } } @@ -281,7 +283,6 @@ impl Search { 
result.push_str(&snippet.fragment()[start_from..]); result.replace('\n', " ") } - } #[cfg(test)] @@ -382,8 +383,11 @@ mod tests { ("Soledad", vec![(1, "Salinas")]), ("Soledud", vec![(1, "River")]), ("Soledud Salinos", vec![(1, "the")]), // actual fuzzy - // ("Sole AND Sali", vec![1]), // Does not work: searching for ['sole' 'and', 'sali'] - ("mystifiziert", vec![(1, "Mystifizierung"), (2, "No preview")]), + // ("Sole AND Sali", vec![1]), // Does not work: searching for ['sole' 'and', 'sali'] + ( + "mystifiziert", + vec![(1, "Mystifizierung"), (2, "No preview")], + ), ]); // one-word search returning 1 result with 1 page for (search_term, pages) in &test_cases { From 45b667ab4cff34524a9d9695693ef2b5bd726f40 Mon Sep 17 00:00:00 2001 From: fux Date: Sun, 6 Oct 2024 01:39:00 +0200 Subject: [PATCH 08/11] uses immutable path and adds extension, instead of replacing existing extension on mutable path Co-authored-by: SimonThormeyer <49559340+SimonThormeyer@users.noreply.github.com> --- index/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/src/index.rs b/index/src/index.rs index 1dbc681..e84e1f5 100644 --- a/index/src/index.rs +++ b/index/src/index.rs @@ -489,10 +489,10 @@ impl Index { } } - fn create_page_index(path: &mut PathBuf, body: &str) -> Result<()> { + fn create_page_index(path: &Path, body: &str) -> Result<()> { // Create reversed index map let pindex: PageIndex = Self::split_text_into_words(body)?; - path.set_extension("pageindex"); + let path = path.with_extension(".pageindex"); let json_str = serde_json::to_string(&pindex).map_err(|e| CreationError(e.to_string()))?; std::fs::write(path, json_str).map_err(|e| CreationError(e.to_string()))?; Ok(()) From 58b2973a069bd707f188cb17a68ebdb73217a313 Mon Sep 17 00:00:00 2001 From: fux Date: Sun, 6 Oct 2024 02:05:11 +0200 Subject: [PATCH 09/11] Revert "uses immutable path and adds extension, instead of replacing existing extension on mutable path" This reverts commit 
45b667ab4cff34524a9d9695693ef2b5bd726f40. --- index/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/src/index.rs b/index/src/index.rs index e84e1f5..1dbc681 100644 --- a/index/src/index.rs +++ b/index/src/index.rs @@ -489,10 +489,10 @@ impl Index { } } - fn create_page_index(path: &Path, body: &str) -> Result<()> { + fn create_page_index(path: &mut PathBuf, body: &str) -> Result<()> { // Create reversed index map let pindex: PageIndex = Self::split_text_into_words(body)?; - let path = path.with_extension(".pageindex"); + path.set_extension("pageindex"); let json_str = serde_json::to_string(&pindex).map_err(|e| CreationError(e.to_string()))?; std::fs::write(path, json_str).map_err(|e| CreationError(e.to_string()))?; Ok(()) From 3a1d0cfae91ce0f1e6f15fd559311e5097f61da2 Mon Sep 17 00:00:00 2001 From: fux Date: Sun, 6 Oct 2024 02:42:18 +0200 Subject: [PATCH 10/11] Fixes minor errors and suggestions from PR review --- index/src/index.rs | 15 +++++++-------- search/src/search.rs | 3 +-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/index/src/index.rs b/index/src/index.rs index 1dbc681..ccaaf7c 100644 --- a/index/src/index.rs +++ b/index/src/index.rs @@ -370,7 +370,7 @@ impl Index { let page_body = std::fs::read_to_string(&page_path) .map_err(|e| PdfParseError(e.to_string()))?; self.add_page(dir_entry.path(), page_number, &page_path, &page_body)?; - Self::create_page_index(&mut page_path.clone(), &page_body)?; + Self::store_page_index(&mut page_path.clone(), Self::create_page_index(&page_body)?)?; } } @@ -399,7 +399,7 @@ impl Index { .map_err(|e| TxtParseError(e.to_string() + full_path.to_string_lossy().as_ref()))?; // Finally, add page self.add_page(dir_entry.path(), page_number, &page_path, &body)?; - Self::create_page_index(&mut page_path.clone(), &body)?; + Self::store_page_index(&mut page_path.clone(), Self::create_page_index(&body)?)?; Ok(page_number) } @@ -489,16 +489,15 @@ impl Index { } } - fn 
create_page_index(path: &mut PathBuf, body: &str) -> Result<()> { + fn store_page_index(path: &Path, pindex: PageIndex) -> Result<()> { // Create reversed index map - let pindex: PageIndex = Self::split_text_into_words(body)?; - path.set_extension("pageindex"); + let path = path.with_extension("pageindex"); let json_str = serde_json::to_string(&pindex).map_err(|e| CreationError(e.to_string()))?; std::fs::write(path, json_str).map_err(|e| CreationError(e.to_string()))?; Ok(()) } - fn split_text_into_words(body: &str) -> Result { + fn create_page_index(body: &str) -> Result { let mut pindex: PageIndex = HashMap::new(); let mut i = 0; let graphemes: Vec<&str> = body.graphemes(true).collect(); @@ -597,9 +596,9 @@ mod tests { } #[test] - fn test_split_text_into_words() { + fn test_() { let text = "Hello*&%&^%, beautiful\n\rWörld!"; - let result = Index::split_text_into_words(text).unwrap_or_default(); + let result = Index::create_page_index(text).unwrap_or_default(); assert!(result.contains_key("Hello")); assert!(result.contains_key("beautiful")); assert!(result.contains_key("Wörld")); diff --git a/search/src/search.rs b/search/src/search.rs index ca40060..4c53122 100644 --- a/search/src/search.rs +++ b/search/src/search.rs @@ -251,8 +251,7 @@ impl Search { let mut cur: (String, u32, u32) = ("".to_string(), 0, 0); let mut min_dist: usize = usize::MAX; for (word, matches) in pindex { - let dist: usize = levenshtein(term, &word); - let dist = if word.contains(term) { 1 } else { dist }; + let dist: usize = if word.contains(term) { 1 } else { levenshtein(term, &word)}; println!("{} ~ {} = {}", term, word, dist); if dist < min_dist { min_dist = dist; From 07bd5f94e62ab761a07ec1adf02f7f0c0880686e Mon Sep 17 00:00:00 2001 From: fux Date: Sun, 6 Oct 2024 02:50:22 +0200 Subject: [PATCH 11/11] Fixes clippy --- index/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/src/index.rs b/index/src/index.rs index ccaaf7c..fd3c325 100644 --- 
a/index/src/index.rs +++ b/index/src/index.rs @@ -370,7 +370,7 @@ impl Index { let page_body = std::fs::read_to_string(&page_path) .map_err(|e| PdfParseError(e.to_string()))?; self.add_page(dir_entry.path(), page_number, &page_path, &page_body)?; - Self::store_page_index(&mut page_path.clone(), Self::create_page_index(&page_body)?)?; + Self::store_page_index(&page_path.clone(), Self::create_page_index(&page_body)?)?; } } @@ -399,7 +399,7 @@ impl Index { .map_err(|e| TxtParseError(e.to_string() + full_path.to_string_lossy().as_ref()))?; // Finally, add page self.add_page(dir_entry.path(), page_number, &page_path, &body)?; - Self::store_page_index(&mut page_path.clone(), Self::create_page_index(&body)?)?; + Self::store_page_index(&page_path.clone(), Self::create_page_index(&body)?)?; Ok(page_number) }