Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reworking search: tokenization, handling of quoted literal search, and postgres fuzziness #2351

Merged
merged 36 commits into from
May 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
2df3731
Creating postgres migration script and starting to set up to detect d…
jecorn Apr 13, 2023
72e70f5
non-working placeholders for postgres pg_tgrm
jecorn Apr 13, 2023
a8b7ea8
First draft of some indexes
jecorn Apr 13, 2023
80d1b5b
non-working commit of postgres indexing
jecorn Apr 14, 2023
3fe9da0
Further non-working edits to db-centric fuzzy search
jecorn Apr 14, 2023
04f9896
update alembic for extensions
jecorn Apr 14, 2023
54c1cac
More non-working setup
jecorn Apr 14, 2023
d4c7256
Move db type check to init_db
jecorn Apr 15, 2023
853c43f
fix typo in db name check
jecorn Apr 15, 2023
86c6fca
Add sqlite token search and postgres full text search
jecorn Apr 15, 2023
1378be7
reorder search to hit exact matches faster
jecorn Apr 15, 2023
4ddaa8f
Add settings and docs for POSTGRES_LANGUAGE (full text search)
jecorn Apr 16, 2023
22256af
Use user-specified POSTGRES_LANGUAGE in search
jecorn Apr 16, 2023
98f6a28
fix fuzzy search typo
jecorn Apr 17, 2023
e64082f
Remove full text search and instead order by trigram match
jecorn Apr 17, 2023
3346811
cleaner adding of indices, remove fulltext
jecorn Apr 17, 2023
41dda03
Cleanup old import of getting app settings
jecorn Apr 17, 2023
53e7fa0
Fix typo in index
jecorn Apr 17, 2023
236e166
Fix some alembic fuzzy typos
jecorn Apr 17, 2023
adce758
Remove diagnostic printing from alembic migration
jecorn Apr 17, 2023
021bb2b
Fix mixed up commutator for trigram operator and relax criteria
jecorn Apr 17, 2023
0ee9d55
forgot to remove query debug
jecorn Apr 17, 2023
8f1d1da
sort only on name
jecorn Apr 17, 2023
81947f4
token and fuzzy search tests
jecorn Apr 17, 2023
ba250c4
Refactor recipe search test to avoid rare random string cross-matches.
jecorn Apr 18, 2023
605b2de
Add ability to quote parts of search for exact match
jecorn Apr 18, 2023
54aad08
Remove internal punctuation, unless it's quoted for literal search
jecorn Apr 18, 2023
6cfe202
Add tests for special character removal and literal search
jecorn Apr 18, 2023
dfc602b
Remove the outer double quotes from searches, but leave internal sing…
jecorn Apr 18, 2023
37b2ba6
Update tests to avoid intra-test name collisions
jecorn Apr 18, 2023
2a4a6a1
Fixing leftovers highlighted by lint
jecorn Apr 18, 2023
e7d7da9
cleanup linting and mypy errors
jecorn Apr 19, 2023
0eccb76
Fix test cross-matching on dirty db (leftovers from bulk import)
jecorn Apr 19, 2023
7492a7f
forgot to cleanup something when debugging mypy errors
jecorn Apr 19, 2023
2095a47
re-order pg_trgm loading in postgres
jecorn Apr 19, 2023
dc2d56c
address comments
jecorn May 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""postgres fuzzy search

Revision ID: b3dbb554ba53
Revises: 38514b39a824
Create Date: 2023-04-13 06:47:04.617131

"""
import sqlalchemy as sa

import mealie.db.migration_types
from alembic import op
import alembic.context as context
from mealie.core.config import get_app_settings

# revision identifiers, used by Alembic.
revision = "b3dbb554ba53"
down_revision = "38514b39a824"
branch_labels = None
depends_on = None


def get_db_type():
    """Return the SQLAlchemy dialect name of the database being migrated.

    Typical values are "postgresql" and "sqlite"; used to gate the
    postgres-only trigram setup below.
    """
    migration_context = op.get_context()
    return migration_context.dialect.name


def setup_postgres_trigrams():
    """Enable the pg_trgm extension and add GIN trigram indexes.

    Creates one ``gin_trgm_ops`` index per normalized text column used by
    recipe search, enabling fast fuzzy (trigram) matching on postgres.
    """
    op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm;")

    # (index name, table, column) for every searchable normalized column
    trigram_indexes = [
        ("ix_recipes_name_normalized_gin", "recipes", "name_normalized"),
        ("ix_recipes_description_normalized_gin", "recipes", "description_normalized"),
        ("ix_recipes_ingredients_note_normalized_gin", "recipes_ingredients", "note_normalized"),
        ("ix_recipes_ingredients_original_text_normalized_gin", "recipes_ingredients", "original_text_normalized"),
    ]
    for index_name, table, column in trigram_indexes:
        op.create_index(
            index_name,
            table_name=table,
            columns=[column],
            unique=False,
            postgresql_using="gin",
            postgresql_ops={column: "gin_trgm_ops"},
        )


def remove_postgres_trigrams():
    """Drop the trigram GIN indexes, then the pg_trgm extension.

    The indexes must be dropped BEFORE the extension: they use the
    ``gin_trgm_ops`` operator class provided by pg_trgm, so dropping the
    extension first would fail on the dependency (or require CASCADE).
    """
    # NOTE: the recipe indexes live on table "recipes" (see setup above),
    # not "recipe" — the previous table_name was a typo.
    op.drop_index("ix_recipes_name_normalized_gin", table_name="recipes")
    op.drop_index("ix_recipes_description_normalized_gin", table_name="recipes")
    op.drop_index("ix_recipes_ingredients_note_normalized_gin", table_name="recipes_ingredients")
    op.drop_index("ix_recipes_ingredients_original_text_normalized_gin", table_name="recipes_ingredients")
    op.execute("DROP EXTENSION IF EXISTS pg_trgm;")


def upgrade():
    """Apply the migration: add trigram search support (postgres only)."""
    if get_db_type() != "postgresql":
        return  # sqlite and other backends need no schema changes
    setup_postgres_trigrams()


def downgrade():
    """Revert the migration: remove trigram search support (postgres only).

    Compares against "postgresql" — the actual SQLAlchemy dialect name and
    the same value ``upgrade()`` checks. The previous check for "postgres"
    never matched, so downgrades silently skipped the index/extension removal.
    """
    if get_db_type() == "postgresql":
        remove_postgres_trigrams()
7 changes: 7 additions & 0 deletions docs/docs/documentation/getting-started/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ Yes, you can install Mealie on your local machine. HOWEVER, it is recommended th

- [Advanced Installation](../installation/advanced/)

## What is fuzzy search and how do I use it?
Mealie can use fuzzy search, which is robust to minor typos. For example, searching for "brocolli" will still find your recipe for "broccoli soup". But fuzzy search is only functional on a Postgres database backend. To enable fuzzy search you will need to migrate to Postgres:

1. Backup your database and download the .zip file (same as when [migrating](./migrating-to-mealie-v1.md))
2. Set up a [Postgres](./installation/postgres.md) instance of Mealie
3. Upload the backup .zip and click to apply it (as a migration)

## How can I attach an image or video to a Recipe?

Yes. Mealie's Recipe Steps and other fields support the markdown syntax and therefore support images and videos. To attach an image to the recipe, you can upload it as an asset and use the provided copy button to generate the html image tag required to render the image. For videos, Mealie provides no way to host videos. You'll need to host your videos with another provider and embed them in your recipe. Generally, the video provider will provide a link to the video and the html tag required to render the video. For example, youtube provides the following link that works inside a step. You can adjust the width and height attributes as necessary to ensure a fit.
Expand Down
3 changes: 3 additions & 0 deletions mealie/db/init_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ def main():
logger.info("Migration needed. Performing migration...")
command.upgrade(alembic_cfg, "head")

if session.get_bind().name == "postgresql": # needed for fuzzy search and fast GIN text indices
session.execute(text("CREATE EXTENSION IF NOT EXISTS pg_trgm;"))

db = get_repositories(session)

if db.users.get_all():
Expand Down
46 changes: 43 additions & 3 deletions mealie/db/models/recipe/ingredient.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import TYPE_CHECKING

import sqlalchemy as sa
from sqlalchemy import Boolean, Float, ForeignKey, Integer, String, event, orm
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.orm.session import Session
from text_unidecode import unidecode

from mealie.db.models._model_base import BaseMixins, SqlAlchemyBase
Expand Down Expand Up @@ -87,21 +89,59 @@ class RecipeIngredientModel(SqlAlchemyBase, BaseMixins):
original_text_normalized: Mapped[str | None] = mapped_column(String, index=True)

@auto_init()
def __init__(self, note: str | None = None, orginal_text: str | None = None, **_) -> None:
def __init__(self, session: Session, note: str | None = None, orginal_text: str | None = None, **_) -> None:
# SQLAlchemy events do not seem to register things that are set during auto_init
if note is not None:
self.note_normalized = unidecode(note).lower().strip()

if orginal_text is not None:
self.orginal_text = unidecode(orginal_text).lower().strip()

tableargs = [ # base set of indices
sa.Index(
"ix_recipes_ingredients_note_normalized",
"note_normalized",
unique=False,
),
sa.Index(
"ix_recipes_ingredients_original_text_normalized",
"original_text_normalized",
unique=False,
),
]
if session.get_bind().name == "postgresql":
tableargs.extend(
[
sa.Index(
"ix_recipes_ingredients_note_normalized_gin",
"note_normalized",
unique=False,
postgresql_using="gin",
postgresql_ops={
"note_normalized": "gin_trgm_ops",
},
),
sa.Index(
"ix_recipes_ingredients_original_text_normalized_gin",
"original_text",
unique=False,
postgresql_using="gin",
postgresql_ops={
"original_text_normalized": "gin_trgm_ops",
},
),
]
)
# add indices
self.__table_args__ = tuple(tableargs)


@event.listens_for(RecipeIngredientModel.note, "set")
def receive_note(target: RecipeIngredientModel, value: str, oldvalue, initiator):
if value is not None:
target.name_normalized = unidecode(value).lower().strip()
target.note_normalized = unidecode(value).lower().strip()
else:
target.name_normalized = None
target.note_normalized = None


@event.listens_for(RecipeIngredientModel.original_text, "set")
Expand Down
44 changes: 43 additions & 1 deletion mealie/db/models/recipe/recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@

class RecipeModel(SqlAlchemyBase, BaseMixins):
__tablename__ = "recipes"
__table_args__ = (sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),)
__table_args__: tuple[sa.UniqueConstraint, ...] = (
sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
)

id: Mapped[GUID] = mapped_column(GUID, primary_key=True, default=GUID.generate)
slug: Mapped[str | None] = mapped_column(sa.String, index=True)
Expand Down Expand Up @@ -192,6 +194,46 @@ def __init__(
if description is not None:
self.description_normalized = unidecode(description).lower().strip()

tableargs = [ # base set of indices
sa.UniqueConstraint("slug", "group_id", name="recipe_slug_group_id_key"),
sa.Index(
"ix_recipes_name_normalized",
"name_normalized",
unique=False,
),
sa.Index(
"ix_recipes_description_normalized",
"description_normalized",
unique=False,
),
]

if session.get_bind().name == "postgresql":
tableargs.extend(
[
sa.Index(
"ix_recipes_name_normalized_gin",
"name_normalized",
unique=False,
postgresql_using="gin",
postgresql_ops={
"name_normalized": "gin_trgm_ops",
},
),
sa.Index(
"ix_recipes_description_normalized_gin",
"description_normalized",
unique=False,
postgresql_using="gin",
postgresql_ops={
"description_normalized": "gin_trgm_ops",
},
),
]
)
# add indices
self.__table_args__ = tuple(tableargs)


@event.listens_for(RecipeModel.name, "set")
def receive_name(target: RecipeModel, value: str, oldvalue, initiator):
Expand Down
98 changes: 82 additions & 16 deletions mealie/repos/repository_recipes.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import re as re
from collections.abc import Sequence
from random import randint
from uuid import UUID

from pydantic import UUID4
from slugify import slugify
from sqlalchemy import Select, and_, desc, func, or_, select
from sqlalchemy import Select, and_, desc, func, or_, select, text
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
from text_unidecode import unidecode
Expand Down Expand Up @@ -151,29 +152,94 @@ def _uuids_for_items(self, items: list[UUID | str] | None, model: type[SqlAlchem
return ids + additional_ids

def _add_search_to_query(self, query: Select, search: str) -> Select:
"""
0. fuzzy search (postgres only) and tokenized search are performed separately
1. take search string and do a little pre-normalization
2. look for internal quoted strings and keep them together as "literal" parts of the search
3. remove special characters from each non-literal search string
4. token search looks for any individual exact hit in name, description, and ingredients
5. fuzzy search looks for trigram hits in name, description, and ingredients
6. Sort order is determined by closeness to the recipe name
Should search also look at tags?
"""

normalized_search = unidecode(search).lower().strip()
punctuation = "!\#$%&()*+,-./:;<=>?@[\\]^_`{|}~" # string.punctuation with ' & " removed
# keep quoted phrases together as literal portions of the search string
literal = False
quoted_regex = re.compile(r"""(["'])(?:(?=(\\?))\2.)*?\1""") # thank you stack exchange!
jecorn marked this conversation as resolved.
Show resolved Hide resolved
removequotes_regex = re.compile(r"""['"](.*)['"]""")
if quoted_regex.search(normalized_search):
literal = True
temp = normalized_search
quoted_search_list = [match.group() for match in quoted_regex.finditer(temp)] # all quoted strings
quoted_search_list = [removequotes_regex.sub("\\1", x) for x in quoted_search_list] # remove outer quotes
temp = quoted_regex.sub("", temp) # remove all quoted strings, leaving just non-quoted
temp = temp.translate(
str.maketrans(punctuation, " " * len(punctuation))
) # punctuation->spaces for splitting, but only on unquoted strings
unquoted_search_list = temp.split() # all unquoted strings
normalized_search_list = quoted_search_list + unquoted_search_list
else:
#
normalized_search = normalized_search.translate(str.maketrans(punctuation, " " * len(punctuation)))
normalized_search_list = normalized_search.split()
normalized_search_list = [x.strip() for x in normalized_search_list] # remove padding whitespace inside quotes
# I would prefer to just do this in the recipe_ingredient.any part of the main query, but it turns out
# that at least sqlite wont use indexes for that correctly anymore and takes a big hit, so prefiltering it is
ingredient_ids = (
self.session.execute(
select(RecipeIngredientModel.id).filter(
or_(
RecipeIngredientModel.note_normalized.like(f"%{normalized_search}%"),
RecipeIngredientModel.original_text_normalized.like(f"%{normalized_search}%"),
if (self.session.get_bind().name == "postgresql") & (literal is False): # fuzzy search
ingredient_ids = (
self.session.execute(
select(RecipeIngredientModel.id).filter(
or_(
RecipeIngredientModel.note_normalized.op("%>")(normalized_search),
RecipeIngredientModel.original_text_normalized.op("%>")(normalized_search),
)
)
)
.scalars()
.all()
)
else: # exact token search
ingredient_ids = (
self.session.execute(
select(RecipeIngredientModel.id).filter(
or_(
*[RecipeIngredientModel.note_normalized.like(f"%{ns}%") for ns in normalized_search_list],
*[
RecipeIngredientModel.original_text_normalized.like(f"%{ns}%")
for ns in normalized_search_list
],
)
)
)
.scalars()
.all()
)
.scalars()
.all()
)

q = query.filter(
or_(
RecipeModel.name_normalized.like(f"%{normalized_search}%"),
RecipeModel.description_normalized.like(f"%{normalized_search}%"),
RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
if (self.session.get_bind().name == "postgresql") & (literal is False): # fuzzy search
# default = 0.7 is too strict for effective fuzzing
self.session.execute(text("set pg_trgm.word_similarity_threshold = 0.5;"))
q = query.filter(
or_(
RecipeModel.name_normalized.op("%>")(normalized_search),
RecipeModel.description_normalized.op("%>")(normalized_search),
RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
)
).order_by( # trigram ordering could be too slow on million record db, but is fine with thousands.
func.least(
RecipeModel.name_normalized.op("<->>")(normalized_search),
)
)
).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))
else: # exact token search
q = query.filter(
or_(
*[RecipeModel.name_normalized.like(f"%{ns}%") for ns in normalized_search_list],
*[RecipeModel.description_normalized.like(f"%{ns}%") for ns in normalized_search_list],
RecipeModel.recipe_ingredient.any(RecipeIngredientModel.id.in_(ingredient_ids)),
)
).order_by(desc(RecipeModel.name_normalized.like(f"%{normalized_search}%")))

return q

def page_all(
Expand Down
Loading