"""postgres fuzzy search

Revision ID: b3dbb554ba53
Revises: 38514b39a824
Create Date: 2023-04-13 06:47:04.617131

"""
import sqlalchemy as sa

import mealie.db.migration_types
from alembic import op
import alembic.context as context
from mealie.core.config import get_app_settings

# revision identifiers, used by Alembic.
revision = "b3dbb554ba53"
down_revision = "38514b39a824"
branch_labels = None
depends_on = None

# (index name, table name, indexed column) for every trigram GIN index owned by
# this revision. Upgrade and downgrade both iterate over this table so the two
# directions cannot drift apart.
_TRIGRAM_INDEXES = (
    ("ix_recipes_name_normalized_gin", "recipes", "name_normalized"),
    ("ix_recipes_description_normalized_gin", "recipes", "description_normalized"),
    ("ix_recipes_ingredients_note_normalized_gin", "recipes_ingredients", "note_normalized"),
    (
        "ix_recipes_ingredients_original_text_normalized_gin",
        "recipes_ingredients",
        "original_text_normalized",
    ),
)


def get_db_type():
    """Return the dialect name of the current migration context (e.g. ``"postgresql"``)."""
    return op.get_context().dialect.name


def setup_postgres_trigrams():
    """Enable ``pg_trgm`` and create one ``gin_trgm_ops`` GIN index per normalized text column."""
    op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
    for index_name, table_name, column in _TRIGRAM_INDEXES:
        op.create_index(
            index_name,
            table_name=table_name,
            columns=[column],
            unique=False,
            postgresql_using="gin",
            postgresql_ops={
                column: "gin_trgm_ops",
            },
        )


def remove_postgres_trigrams():
    """Drop the trigram GIN indexes created by this revision, then the ``pg_trgm`` extension."""
    # Indexes go first: they are built with the extension's ``gin_trgm_ops``
    # operator class, and a plain DROP EXTENSION refuses to run while objects
    # still depend on the extension.
    for index_name, table_name, _column in reversed(_TRIGRAM_INDEXES):
        op.drop_index(index_name, table_name=table_name)
    op.execute("DROP EXTENSION IF EXISTS pg_trgm;")


def upgrade():
    """Apply the trigram setup on PostgreSQL; a no-op for every other dialect."""
    if get_db_type() == "postgresql":
        setup_postgres_trigrams()


def downgrade():
    """Undo the trigram setup on PostgreSQL; a no-op for every other dialect."""
    # Must be the same dialect name that upgrade() checks ("postgresql"),
    # otherwise the downgrade silently does nothing.
    if get_db_type() == "postgresql":
        remove_postgres_trigrams()
@auto_init()
def __init__(
    self,
    session: Session,
    note: str | None = None,
    orginal_text: str | None = None,
    original_text: str | None = None,
    **_,
) -> None:
    """Populate the normalized search columns and build the index definitions.

    Args:
        session: Session whose bind name decides whether the PostgreSQL-only
            GIN index definitions are added.
        note: Raw ingredient note; stored normalized in ``note_normalized``.
        orginal_text: Historical (misspelled) keyword for the raw original
            text. Kept so existing callers continue to work.
        original_text: Correctly spelled keyword for the raw original text.
            Takes precedence over ``orginal_text`` when both are given.
    """
    # SQLAlchemy events do not seem to register things that are set during auto_init
    if note is not None:
        self.note_normalized = unidecode(note).lower().strip()

    raw_original_text = original_text if original_text is not None else orginal_text
    if raw_original_text is not None:
        # Write to the real normalized column; the previous code assigned the
        # value to a misspelled ``orginal_text`` attribute that is not a column.
        self.original_text_normalized = unidecode(raw_original_text).lower().strip()

    tableargs = [  # base set of indices
        sa.Index(
            "ix_recipes_ingredients_note_normalized",
            "note_normalized",
            unique=False,
        ),
        sa.Index(
            "ix_recipes_ingredients_original_text_normalized",
            "original_text_normalized",
            unique=False,
        ),
    ]
    if session.get_bind().name == "postgresql":
        tableargs.extend(
            [
                sa.Index(
                    "ix_recipes_ingredients_note_normalized_gin",
                    "note_normalized",
                    unique=False,
                    postgresql_using="gin",
                    postgresql_ops={
                        "note_normalized": "gin_trgm_ops",
                    },
                ),
                sa.Index(
                    "ix_recipes_ingredients_original_text_normalized_gin",
                    # The indexed column must be the one named in postgresql_ops
                    # (and the one the migration indexes), not "original_text".
                    "original_text_normalized",
                    unique=False,
                    postgresql_using="gin",
                    postgresql_ops={
                        "original_text_normalized": "gin_trgm_ops",
                    },
                ),
            ]
        )
    # add indices
    # NOTE(review): this assigns __table_args__ on the *instance*. Declarative
    # mappings normally read __table_args__ from the class when the table is
    # configured, so this assignment probably does not affect the schema --
    # confirm, and consider moving the definitions to class level.
    self.__table_args__ = tuple(tableargs)


@event.listens_for(RecipeIngredientModel.note, "set")
def receive_note(target: RecipeIngredientModel, value: str | None, oldvalue, initiator):
    """Keep ``note_normalized`` in sync whenever ``note`` is assigned."""
    if value is not None:
        target.note_normalized = unidecode(value).lower().strip()
    else:
        target.note_normalized = None
= ( + sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"), + ) id: Mapped[GUID] = mapped_column(GUID, primary_key=True, default=GUID.generate) slug: Mapped[str | None] = mapped_column(sa.String, index=True) @@ -192,6 +194,46 @@ def __init__( if description is not None: self.description_normalized = unidecode(description).lower().strip() + tableargs = [ # base set of indices + sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"), + sa.Index( + "ix_recipes_name_normalized", + "name_normalized", + unique=False, + ), + sa.Index( + "ix_recipes_description_normalized", + "description_normalized", + unique=False, + ), + ] + + if session.get_bind().name == "postgresql": + tableargs.extend( + [ + sa.Index( + "ix_recipes_name_normalized_gin", + "name_normalized", + unique=False, + postgresql_using="gin", + postgresql_ops={ + "name_normalized": "gin_trgm_ops", + }, + ), + sa.Index( + "ix_recipes_description_normalized_gin", + "description_normalized", + unique=False, + postgresql_using="gin", + postgresql_ops={ + "description_normalized": "gin_trgm_ops", + }, + ), + ] + ) + # add indices + self.__table_args__ = tuple(tableargs) + @event.listens_for(RecipeModel.name, "set") def receive_name(target: RecipeModel, value: str, oldvalue, initiator): diff --git a/mealie/repos/repository_recipes.py b/mealie/repos/repository_recipes.py index 947b75a334..a782daf471 100644 --- a/mealie/repos/repository_recipes.py +++ b/mealie/repos/repository_recipes.py @@ -1,10 +1,11 @@ +import re as re from collections.abc import Sequence from random import randint from uuid import UUID from pydantic import UUID4 from slugify import slugify -from sqlalchemy import Select, and_, desc, func, or_, select +from sqlalchemy import Select, and_, desc, func, or_, select, text from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import joinedload from text_unidecode import unidecode @@ -151,29 +152,94 @@ def _uuids_for_items(self, items: list[UUID | str] | 
def _add_search_to_query(self, query: Select, search: str) -> Select:
    """Restrict ``query`` to recipes matching the free-text ``search`` string.

    0. fuzzy search (postgres only) and tokenized search are performed separately
    1. the search string is normalized (unidecode, lower-case, stripped)
    2. quoted sub-strings are kept together as "literal" tokens
    3. punctuation in the unquoted remainder is replaced by spaces before splitting
    4. token search matches any individual token in name, description, or ingredients
    5. fuzzy search matches trigram hits in name, description, or ingredients
    6. sort order is determined by closeness to the recipe name

    Args:
        query: Select statement to narrow down.
        search: Raw user-supplied search string.

    Returns:
        ``query`` with the search filter and ordering applied. A search that
        contains no usable token (e.g. only punctuation) matches nothing.
    """
    normalized_search = unidecode(search).lower().strip()
    # string.punctuation with ' & " removed (those delimit literal phrases)
    punctuation = "!#$%&()*+,-./:;<=>?@[\\]^_`{|}~"
    punctuation_to_spaces = str.maketrans(punctuation, " " * len(punctuation))
    quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""")

    # keep quoted phrases together as literal portions of the search string
    quoted_tokens = [match.group()[1:-1] for match in quoted_regex.finditer(normalized_search)]
    literal = bool(quoted_tokens)
    unquoted = quoted_regex.sub("", normalized_search) if literal else normalized_search
    # punctuation->spaces for splitting, but only on unquoted text
    unquoted_tokens = unquoted.translate(punctuation_to_spaces).split()
    # Trim padding inside quotes and drop empty tokens: an empty token would
    # become LIKE '%%', which matches every row.
    stripped_tokens = (token.strip() for token in quoted_tokens + unquoted_tokens)
    normalized_search_list = [token for token in stripped_tokens if token]
    # Whole phrase without quotes/punctuation; used for fuzzy matching and ordering.
    phrase = " ".join(normalized_search_list)

    use_fuzzy = self.session.get_bind().name == "postgresql" and not literal

    if use_fuzzy:
        # pg_trgm's default word-similarity cut-off (0.6 per the PostgreSQL docs) is
        # too strict for effective fuzzing. Set it before *both* trigram queries so
        # the ingredient pre-filter and the recipe filter use the same threshold.
        self.session.execute(text("set pg_trgm.word_similarity_threshold = 0.5;"))
        ingredient_filter = or_(
            RecipeIngredientModel.note_normalized.op("%>")(phrase),
            RecipeIngredientModel.original_text_normalized.op("%>")(phrase),
        )
    elif normalized_search_list:
        ingredient_filter = or_(
            *[RecipeIngredientModel.note_normalized.like(f"%{ns}%") for ns in normalized_search_list],
            *[RecipeIngredientModel.original_text_normalized.like(f"%{ns}%") for ns in normalized_search_list],
        )
    else:
        # No usable token: avoid an argument-less or_(), which would not filter at all.
        ingredient_filter = None

    # I would prefer to just do this in the recipe_ingredient.any part of the main query, but it turns out
    # that at least sqlite wont use indexes for that correctly anymore and takes a big hit, so prefiltering it is
    if ingredient_filter is None:
        ingredient_ids = []
    else:
        ingredient_ids = (
            self.session.execute(select(RecipeIngredientModel.id).filter(ingredient_filter)).scalars().all()
        )

    has_matching_ingredient = RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids))

    if use_fuzzy:
        # trigram ordering could be too slow on million record db, but is fine with thousands.
        return query.filter(
            or_(
                RecipeModel.name_normalized.op("%>")(phrase),
                RecipeModel.description_normalized.op("%>")(phrase),
                has_matching_ingredient,
            )
        ).order_by(RecipeModel.name_normalized.op("<->>")(phrase))

    # exact token search; recipes whose name contains the whole phrase sort first
    return query.filter(
        or_(
            *[RecipeModel.name_normalized.like(f"%{ns}%") for ns in normalized_search_list],
            *[RecipeModel.description_normalized.like(f"%{ns}%") for ns in normalized_search_list],
            has_matching_ingredient,
        )
    ).order_by(desc(RecipeModel.name_normalized.like(f"%{phrase}%")))
Sloop", ), # Test diacritics Recipe( @@ -481,28 +473,50 @@ def test_recipe_repo_search(database: AllRepositories, unique_user: TestUser): assert len(empty_result) == 0 # Search by title - title_result = database.recipes.page_all(pagination_query, search=name_part_2).items + title_result = database.recipes.page_all(pagination_query, search="Steinbock").items assert len(title_result) == 1 - assert title_result[0].name == name_2 + assert title_result[0].name == "Steinbock Sloop" # Search by description - description_result = database.recipes.page_all(pagination_query, search=description_part_1).items + description_result = database.recipes.page_all(pagination_query, search="horns").items assert len(description_result) == 1 - assert description_result[0].name == name_1 + assert description_result[0].name == "Steinbock Sloop" # Search by ingredient - ingredient_result = database.recipes.page_all(pagination_query, search=ingredient_2).items + ingredient_result = database.recipes.page_all(pagination_query, search="moss").items assert len(ingredient_result) == 1 - assert ingredient_result[0].name == name_2 + assert ingredient_result[0].name == "Fiddlehead Fern Stir Fry" # Make sure title matches are ordered in front - ordered_result = database.recipes.page_all(pagination_query, search=ingredient_1).items + ordered_result = database.recipes.page_all(pagination_query, search="animal sloop").items assert len(ordered_result) == 2 - assert ordered_result[0].name == name_3 - assert ordered_result[1].name == name_1 + assert ordered_result[0].name == "Animal Sloop" + assert ordered_result[1].name == "Steinbock Sloop" + + # Test literal search + literal_result = database.recipes.page_all(pagination_query, search='"Animal Sloop"').items + assert len(literal_result) == 1 + assert literal_result[0].name == "Animal Sloop" + + # Test special character removal from non-literal searches + character_result = database.recipes.page_all(pagination_query, search="animal-sloop").items + assert 
len(character_result) == 2 + assert character_result[0].name == "Animal Sloop" + assert character_result[1].name == "Steinbock Sloop" # Test string normalization normalized_result = database.recipes.page_all(pagination_query, search="ratat").items print([r.name for r in normalized_result]) assert len(normalized_result) == 1 assert normalized_result[0].name == "Rátàtôuile" + + # Test token separation + token_result = database.recipes.page_all(pagination_query, search="delicious horns").items + assert len(token_result) == 1 + assert token_result[0].name == "Steinbock Sloop" + + # Test fuzzy search + if database.session.get_bind().name == "postgresql": + fuzzy_result = database.recipes.page_all(pagination_query, search="Steinbuck").items + assert len(fuzzy_result) == 1 + assert fuzzy_result[0].name == "Steinbock Sloop"