diff --git a/kolomoni/src/state.rs b/kolomoni/src/state.rs index 407b16b..243cf7e 100644 --- a/kolomoni/src/state.rs +++ b/kolomoni/src/state.rs @@ -12,17 +12,28 @@ use tokio::sync::mpsc; use crate::connect_and_set_up_database; -pub struct KolomoniSearchInner { +/// A dictionary search engine. +/// +/// Handles searching, seeding and incrementally updating the internal index and cache. +pub struct KolomoniSearch { pub engine: KolomoniSearchEngine, change_sender: mpsc::Sender, } -impl KolomoniSearchInner { +impl KolomoniSearch { + /// Run a fuzzy word search with the given `word_search_query`. + /// Returns a list of both slovene and english search results. #[inline] pub async fn search(&self, word_search_query: &str) -> Result { self.engine.search(word_search_query).await } + /// Signals to the search indexer that an english word has been created or updated. + /// + /// This method does not block unless the communication channel is full (which is unlikely). + /// The indexing (and caching) of the created or updated word will be performed in + /// a separate async task as soon as the receiver can pick it up, which will very likely be in + /// less than a second after sending. #[inline] pub async fn signal_english_word_created_or_updated(&self, word_uuid: Uuid) -> Result<()> { self.change_sender @@ -32,6 +43,11 @@ impl KolomoniSearchInner { .wrap_err("Failed to send \"english word created/updated\" event.") } + /// Signals to the search indexer that an english word has been removed from the database. + /// + /// This method does not block unless the communication channel is full (which is unlikely). + /// Removal from index and cache will be performed in a separate async task as soon + /// as the receiver can pick it up, which will very likely be in less than a second after sending. 
#[inline] pub async fn signal_english_word_removed(&self, word_uuid: Uuid) -> Result<()> { self.change_sender @@ -42,6 +58,12 @@ impl KolomoniSearchInner { } + /// Signals to the search indexer that a slovene word has been created or updated. + /// + /// This method does not block unless the communication channel is full (which is unlikely). + /// The indexing (and caching) of the created or updated word will be performed in + /// a separate async task as soon as the receiver can pick it up, which will very likely be in + /// less than a second after sending. #[inline] pub async fn signal_slovene_word_created_or_updated(&self, word_uuid: Uuid) -> Result<()> { self.change_sender @@ -51,6 +73,11 @@ impl KolomoniSearchInner { .wrap_err("Failed to send \"slovene word created/updated\" event.") } + /// Signals to the search indexer that a slovene word has been removed from the database. + /// + /// This method does not block unless the communication channel is full (which is unlikely). + /// Removal from index and cache will be performed in a separate async task as soon + /// as the receiver can pick it up, which will very likely be in less than a second after sending. #[inline] pub async fn signal_slovene_word_removed(&self, word_uuid: Uuid) -> Result<()> { self.change_sender @@ -61,6 +88,12 @@ impl KolomoniSearchInner { } + /// Signals to the search indexer that a category has been created or updated. + /// + /// This method does not block unless the communication channel is full (which is unlikely). + /// The indexing (and caching) of the created or updated category will be performed in + /// a separate async task as soon as the receiver can pick it up, which will very likely be in + /// less than a second after sending. 
#[inline] pub async fn signal_category_created_or_updated(&self, category_id: i32) -> Result<()> { self.change_sender @@ -70,6 +103,11 @@ impl KolomoniSearchInner { .wrap_err("Failed to send \"category created/updated\" event.") } + /// Signals to the search indexer that a category has been removed from the database. + /// + /// This method does not block unless the communication channel is full (which is unlikely). + /// Removal from index and cache will be performed in a separate async task as soon + /// as the receiver can pick it up, which will very likely be in less than a second after sending. #[inline] pub async fn signal_category_removed(&self, category_id: i32) -> Result<()> { self.change_sender @@ -104,7 +142,7 @@ pub struct ApplicationStateInner { /// Authentication token manager (JSON Web Token). pub jwt_manager: JsonWebTokenManager, - pub search: KolomoniSearchInner, + pub search: KolomoniSearch, } impl ApplicationStateInner { @@ -117,7 +155,7 @@ impl ApplicationStateInner { let engine = KolomoniSearchEngine::new(&configuration).await?; let sender = engine.change_event_sender(); - KolomoniSearchInner { + KolomoniSearch { engine, change_sender: sender, } diff --git a/kolomoni_database/src/lib.rs b/kolomoni_database/src/lib.rs index 58e20d5..8d6cb29 100644 --- a/kolomoni_database/src/lib.rs +++ b/kolomoni_database/src/lib.rs @@ -1,3 +1,5 @@ +#![allow(rustdoc::private_intra_doc_links)] + //! This crate contains raw database entities in combination //! with the "business logic", i.e. query and mutation methods for them. //! diff --git a/kolomoni_search/src/cache.rs b/kolomoni_search/src/cache.rs index 3f62ad4..2240ac2 100644 --- a/kolomoni_search/src/cache.rs +++ b/kolomoni_search/src/cache.rs @@ -14,6 +14,7 @@ new_key_type! { struct SloveneWordSlotMapKey; } new_key_type! { struct CategorySlotMapKey; } +/// An english word present in the search indexer cache. #[derive(Clone, PartialEq, Eq, Debug)] pub struct CachedEnglishWord { /// Base english word. 
@@ -30,6 +31,11 @@ pub struct CachedEnglishWord { } impl CachedEnglishWord { + /// Obtain a [`CachedEnglishWord`] given an [`ExpandedEnglishWordInfo`] + /// (which you can get by fetching info from the database) and access to the indexer cache. + /// + /// `None` can be returned if the word has invalid links to slovene words or categories + /// (i.e. when the linked words/categories don't exist in the cache yet). pub fn from_expanded_database_info( expanded_info: ExpandedEnglishWordInfo, cache: &KolomoniEntityCache, @@ -76,10 +82,15 @@ impl CachedEnglishWord { }) } + /// Obtain the [`Uuid`] associated with this english word. pub fn uuid(&self) -> &Uuid { &self.word.word_id } + /// Convert this [`CachedEnglishWord`] back into an [`ExpandedEnglishWordInfo`]. + /// + /// This will require access to the slot maps of the indexer cache + /// (which we need to convert weak slovene word / category references back into their explicit form). fn into_expanded_word_info( self, slot_context: &EntitySlotMapContext, @@ -143,6 +154,7 @@ impl CachedEnglishWord { +/// A slovene word present in the search indexer cache. #[derive(Clone, PartialEq, Eq, Debug)] pub struct CachedSloveneWord { pub word: entities::word_slovene::Model, @@ -151,6 +163,11 @@ pub struct CachedSloveneWord { } impl CachedSloveneWord { + /// Obtain a [`CachedSloveneWord`] given an [`ExpandedSloveneWordInfo`] + /// (which you can get by fetching info from the database) and access to the indexer cache. + /// + /// `None` can be returned if the word has invalid links to categories + /// (i.e. when the categories don't exist in the cache yet). pub fn from_expanded_database_info( expanded_info: ExpandedSloveneWordInfo, cache: &KolomoniEntityCache, @@ -172,10 +189,15 @@ impl CachedSloveneWord { }) } + /// Obtain the [`Uuid`] associated with this slovene word. pub fn uuid(&self) -> &Uuid { &self.word.word_id } + /// Convert this [`CachedSloveneWord`] back into an [`ExpandedSloveneWordInfo`].
+ /// + /// This will require access to the slot maps of the indexer cache + /// (which we need to convert weak category references back into their explicit form). fn into_expanded_word_info( self, slot_context: &EntitySlotMapContext, @@ -199,6 +221,7 @@ impl CachedSloveneWord { +/// A category present in the search indexer cache. #[derive(Clone, PartialEq, Eq, Debug)] pub struct CachedCategory { pub category: entities::category::Model, @@ -219,6 +242,10 @@ impl CachedCategory { } +/// An internal context struct that we use to pass a set of +/// accesses to [`SlotMap`]s residing inside [`KolomoniEntityCache`]. +/// +/// This is an implementation detail and not visible externally. struct EntitySlotMapContext<'e, 's, 'c> { #[allow(dead_code)] english_word_slot_map: &'e SlotMap, @@ -229,6 +256,16 @@ struct EntitySlotMapContext<'e, 's, 'c> { } +/// A cache containing english and slovene words as well as word categories. +/// +/// All entities are stored using [`SlotMap`]s, allowing us to establish weak links between them. +/// +/// For example, this allows us to simply hold weak slot map keys inside the [`translations`][CachedEnglishWord::translations] +/// `Vec` field of the english word. +/// This means that each english word weakly references corresponding slovene words +/// that are linked as translations instead of holding their entire information in themselves, improving memory usage. +/// Another upside to this approach is that modifying a slovene word is immediately reflected in all english words +/// it is a translation of, simplifying word updates. pub struct KolomoniEntityCache { english_word_slot_map: SlotMap, english_word_uuid_to_key_map: HashMap, @@ -241,6 +278,7 @@ pub struct KolomoniEntityCache { } impl KolomoniEntityCache { + /// Initialize an empty entity cache.
pub fn new() -> Self { let english_word_slot_map = SlotMap::::with_key(); let english_word_uuid_to_key_map = HashMap::new(); @@ -262,6 +300,7 @@ impl KolomoniEntityCache { } } + /// Clear the entity cache. pub fn clear(&mut self) { self.english_word_slot_map.clear(); self.english_word_uuid_to_key_map.clear(); @@ -273,6 +312,7 @@ impl KolomoniEntityCache { self.category_id_to_key_map.clear(); } + fn slot_context(&self) -> EntitySlotMapContext { EntitySlotMapContext { english_word_slot_map: &self.english_word_slot_map, @@ -282,6 +322,7 @@ impl KolomoniEntityCache { } + /// Obtain an [`ExpandedEnglishWordInfo`] given its [`Uuid`], if present in the cache and if containing valid connections. pub fn english_word(&self, word_uuid: Uuid) -> Option { let Some(english_word_slot_map_key) = self.english_word_uuid_to_key_map.get(&word_uuid) else { @@ -302,11 +343,16 @@ impl KolomoniEntityCache { .into_expanded_word_info(&self.slot_context()) } + /// Obtain an [`EnglishWordSlotMapKey`] slot map key given its [`Uuid`], if present in the cache. #[allow(dead_code)] fn english_word_key(&self, word_uuid: Uuid) -> Option { self.english_word_uuid_to_key_map.get(&word_uuid).copied() } + /// Insert (or overwrite) an english word into the entity cache. + /// + /// For conversion from [`ExpandedEnglishWordInfo`] to [`CachedEnglishWord`], + /// see [`CachedEnglishWord::from_expanded_database_info`]. pub fn insert_or_update_english_word(&mut self, english_word: CachedEnglishWord) { let word_uuid = *english_word.uuid(); @@ -336,6 +382,9 @@ impl KolomoniEntityCache { self.english_word_uuid_to_key_map.insert(word_uuid, new_key); } + /// Remove an english word from the entity cache. + /// + /// Returns `Err(())` if the word wasn't present in the cache.
pub fn remove_english_word(&mut self, word_uuid: Uuid) -> Result<(), ()> { let Some(english_word_slot_map_key) = self.english_word_uuid_to_key_map.get(&word_uuid) else { @@ -358,6 +407,7 @@ impl KolomoniEntityCache { } + /// Obtain an [`ExpandedSloveneWordInfo`] given its [`Uuid`], if present in the cache and if containing valid connections. pub fn slovene_word(&self, word_uuid: Uuid) -> Option { let Some(slovene_word_slot_map_key) = self.slovene_word_uuid_to_key_map.get(&word_uuid) else { @@ -378,10 +428,15 @@ impl KolomoniEntityCache { .into_expanded_word_info(&self.slot_context()) } + /// Obtain a [`SloveneWordSlotMapKey`] slot map key given its [`Uuid`], if present in the cache. fn slovene_word_key(&self, word_uuid: Uuid) -> Option { self.slovene_word_uuid_to_key_map.get(&word_uuid).copied() } + /// Insert (or overwrite) a slovene word into the entity cache. + /// + /// For conversion from [`ExpandedSloveneWordInfo`] to [`CachedSloveneWord`], + /// see [`CachedSloveneWord::from_expanded_database_info`]. pub fn insert_or_update_slovene_word(&mut self, slovene_word: CachedSloveneWord) { let word_uuid = *slovene_word.uuid(); @@ -411,6 +466,9 @@ impl KolomoniEntityCache { self.slovene_word_uuid_to_key_map.insert(word_uuid, new_key); } + /// Remove a slovene word from the entity cache. + /// + /// Returns `Err(())` if the word wasn't present in the cache. pub fn remove_slovene_word(&mut self, word_uuid: Uuid) -> Result<(), ()> { let Some(slovene_word_slot_map_key) = self.slovene_word_uuid_to_key_map.get(&word_uuid) else { @@ -433,6 +491,7 @@ impl KolomoniEntityCache { } + /// Obtain information about a category given its ID, if present in the cache.
#[allow(dead_code)] pub fn category(&self, category_id: i32) -> Option { let Some(category_slot_map_key) = self.category_id_to_key_map.get(&category_id) else { @@ -448,10 +507,15 @@ impl KolomoniEntityCache { Some(cached_category.clone().into_inner()) } + /// Obtain a [`CategorySlotMapKey`] slot map key given the category's ID, if present in the cache. fn category_key(&self, category_id: i32) -> Option { self.category_id_to_key_map.get(&category_id).copied() } + /// Insert (or overwrite) a category into the entity cache. + /// + /// For conversion from the database category model to [`CachedCategory`], + /// see [`CachedCategory::from_database_model`]. pub fn insert_or_update_category(&mut self, category: CachedCategory) { let category_id = category.id(); @@ -481,6 +545,9 @@ impl KolomoniEntityCache { self.category_id_to_key_map.insert(category_id, new_key); } + /// Remove a category from the entity cache. + /// + /// Returns `Err(())` if the category wasn't present in the cache. pub fn remove_category(&mut self, category_id: i32) -> Result<(), ()> { let Some(category_slot_map_key) = self.category_id_to_key_map.get(&category_id) else { return Err(()); diff --git a/kolomoni_search/src/lib.rs b/kolomoni_search/src/lib.rs index afb5bf2..e439dcf 100644 --- a/kolomoni_search/src/lib.rs +++ b/kolomoni_search/src/lib.rs @@ -40,7 +40,10 @@ use uuid::Uuid; mod cache; -/// Indexed language type. +/// Specialized language type enum used for storage in the word index. +/// +/// **Do not use outside [`kolomoni_search`][crate]! +/// Use [`WordLanguage`][kolomoni_database::shared::WordLanguage] instead!** #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum IndexedWordLanguage { Slovene, @@ -69,33 +72,47 @@ impl IndexedWordLanguage { } +/// Represents a single english or slovene word search result. pub enum SearchResult { English(ExpandedEnglishWordInfo), Slovene(ExpandedSloveneWordInfo), } +/// Represents a set of search results.
pub struct SearchResults { + /// Words that fuzzily match the given search query. pub words: Vec, } -// TODO Next up: need a handler for create/update and delete operations on words and categories -// so we can keep the cache and index up-to-date as the changes get made. - - +/// A change event in relation to english words, slovene words and categories. +/// +/// The variants of this enum are used as a message that is sent to the [`WordIndexChangeHandler`] +/// in order to signal that something has changed in the database and needs to be reindexed/recached. #[derive(Clone, PartialEq, Eq, Debug)] pub enum ChangeEvent { + /// English word has been created or updated. EnglishWordCreatedOrUpdated { word_uuid: Uuid }, + + /// English word has been removed. EnglishWordRemoved { word_uuid: Uuid }, + + /// Slovene word has been created or updated. SloveneWordCreatedOrUpdated { word_uuid: Uuid }, + + /// Slovene word has been removed. SloveneWordRemoved { word_uuid: Uuid }, + + /// Word category has been created or updated. CategoryCreatedOrUpdated { category_id: i32 }, + + /// Word category has been removed. CategoryRemoved { category_id: i32 }, } +/// A handler that receives [`ChangeEvent`]s and incrementally updates the word index and cache. pub struct WordIndexChangeHandler { - // TODO Implement sending these events from application state and implement receiving the messages and updating the index/cache. sender: mpsc::Sender, receiver_task_handle: Mutex>>>, @@ -106,8 +123,9 @@ pub struct WordIndexChangeHandler { } impl WordIndexChangeHandler { - // TODO - + /// Initialize a new [`WordIndexChangeHandler`], starting a background async task + /// that will handle [`ChangeEvent`]s after they are sent through the associated event channel + /// (see [`Self::sender`]). pub(crate) async fn new( inner: Arc>, database: DatabaseConnection, @@ -129,10 +147,26 @@ impl WordIndexChangeHandler { arc_self } + /// Obtain a multi-producer, single-consumer [`Sender`][mpsc::Sender] in order + /// to be able to send [`ChangeEvent`]s to the incremental indexer.
This is a bounded sender, + /// which means that sending through it *can* block (but it likely won't). + /// + /// For example: when an english word is created as a result of e.g. an API call + /// to the backend, the backend should signal to the indexer via this `Sender` that + /// it needs to process the new word. + /// + /// In reality, sending things through this channel is abstracted away inside the + /// [`KolomoniSearch`](../kolomoni/state/struct.KolomoniSearch.html) struct + /// by using the `signal_*` methods. pub fn sender(&self) -> mpsc::Sender { self.sender.clone() } + /// The main [`ChangeEvent`] receiver loop. This task is spawned inside [`Self::new`]. + /// + /// The job of this function is to simply process incoming [`ChangeEvent`]s and + /// incrementally update the index and cache. The processing of each possible message + /// is then delegated to a corresponding `on_*` method on this struct. async fn receiver_loop( self: Arc, mut receiver: mpsc::Receiver, @@ -432,6 +466,7 @@ pub(crate) struct WordIndexSchemaFields { } +/// Construct a [`tantivy`] [`Schema`] and its fields that we need for a word dictionary index. fn construct_indexing_schema() -> (Schema, WordIndexSchemaFields) { let mut word_schema_builder = Schema::builder(); @@ -476,8 +511,9 @@ fn construct_indexing_schema() -> (Schema, WordIndexSchemaFields) { - -async fn clear_index_and_cache(inner: &mut RwLockWriteGuard<'_, WordIndexInner>) -> Result<()> { +/// Given mutable access to [`WordIndexInner`], this function +/// clears the dictionary index and the cache. +async fn clear_index_and_cache(inner: &mut WordIndexInner) -> Result<()> { // Clear existing index. { let mut index_writer = inner @@ -502,7 +538,6 @@ async fn clear_index_and_cache(inner: &mut RwLockWriteGuard<'_, WordIndexInner>) Ok(()) } - /* /// Reindex and cache words that have been modified since the last call /// to [`refresh_modified_words`] or [`initialize_with_fresh_words`].
@@ -637,7 +672,7 @@ async fn refresh_modified_entities(index_inner: &Arc>) -> - +/// Internal search engine implementation. pub(crate) struct WordIndexInner { word_index: Index, @@ -649,7 +684,7 @@ pub(crate) struct WordIndexInner { /// A search engine implementation for Stari Kolomoni. /// -/// Allows fuzzy matching with a maximum Levenshtein distance of 2. +/// Allows per-term fuzzy matching with a maximum Levenshtein distance of 2. pub struct KolomoniSearchEngine { change_handler: Arc, @@ -719,11 +754,23 @@ impl KolomoniSearchEngine { }) } + /// Obtain a multi-producer, single-consumer [`Sender`][mpsc::Sender] in order + /// to be able to send [`ChangeEvent`]s to the incremental indexer. This is a bounded sender, + /// which means that sending through it *can* block (but it likely won't). + /// + /// For example: when an english word is created as a result of e.g. an API call + /// to the backend, the backend should signal to the indexer via this `Sender` that + /// it needs to process the new word. + /// + /// In reality, sending things through this channel is abstracted away inside the + /// [`KolomoniSearch`](../kolomoni/state/struct.KolomoniSearch.html) struct + /// by using the `signal_*` methods. pub fn change_event_sender(&self) -> mpsc::Sender { self.change_handler.sender() } /// Returns matching english and slovene words for the given search query. + /// /// Does not perform any database lookups, and instead relies on the index and cache being up-to-date. pub async fn search(&self, word_search_query: &str) -> Result { let inner = self.inner.read().await; @@ -823,7 +870,7 @@ impl KolomoniSearchEngine { } - /// Clear the index and cache and refresh their contents from a full database scan. + /// Clear the index (and cache) and then seed them from a full database scan.
pub async fn initialize_with_fresh_entries(&mut self) -> Result<()> { let mut inner = self.inner.write().await; @@ -905,7 +952,7 @@ impl KolomoniSearchEngine { // TODO If (or when) the database will start to contain more complex references // (e.g. english words linked to other english words), we'll need to modify this approach: - // we'll create a queue and try to insert entities into the cache until no elements are remaining. + // we'll need weak links between entries, allowing us to insert an entity whose related entities aren't present in the cache yet. let cached_word_entry = CachedSloveneWord::from_expanded_database_info(slovene_word_info, &inner.cache) .expect("failed to convert expanded slovene word info into a cached word");