feat: handling synonyms and text fields more efficiently #234

Merged · 39 commits · Oct 24, 2024

Commits
415840f
feat: wip on generating synonyms
alexgarel Aug 21, 2024
5ccf99c
feat: wip on revamp of taxonomy fields
alexgarel Aug 21, 2024
9a365d9
wip: on changing taxonomy
alexgarel Aug 22, 2024
117624d
fix: various small fixes
alexgarel Aug 22, 2024
a664fe9
fix: larger timeout at index creation time
alexgarel Aug 23, 2024
8c8ed42
feat: add a cleanup-indexes cli command
alexgarel Aug 23, 2024
92fde12
fix: changing config and fixing indexing
alexgarel Aug 23, 2024
0a60ef1
feat: wip on changing query for new synonyms matching
alexgarel Aug 24, 2024
af191c0
feat: query now works with new full text indexing
alexgarel Aug 26, 2024
819d078
test: add integration tests
alexgarel Aug 26, 2024
d86ca93
test: added integration tests
alexgarel Aug 27, 2024
bb49c82
fix: on taxonomy_es refresh taxonomies
alexgarel Aug 27, 2024
91a0929
test: fix some tests and remove deprecated option
alexgarel Aug 27, 2024
47cfacf
feat: add smart_words and boost_phrases
alexgarel Aug 27, 2024
f7c2fa6
feat: change boost_phrase transformer
alexgarel Aug 28, 2024
465adab
feat: add debug info on demand in API
alexgarel Aug 28, 2024
8792c3e
feat: toward a better search
alexgarel Aug 30, 2024
db31f55
test: first api integration test
alexgarel Oct 8, 2024
1218b01
fix: fix tokenizer for taxonomy fields
alexgarel Oct 8, 2024
c3e3da1
refactor: use a model for search get parameters
alexgarel Oct 11, 2024
822a72d
test: integrations tests
alexgarel Oct 11, 2024
567392d
Merge branch 'main' into fix-text-handling
alexgarel Oct 11, 2024
3e3e4d1
refactor: refactor search parameters into something really coherent
alexgarel Oct 14, 2024
475b87c
fix: make taxonomy translations consistent
alexgarel Oct 15, 2024
9828aa7
fix: better guard config access
alexgarel Oct 15, 2024
0bde2e4
fix: on charts
alexgarel Oct 16, 2024
3860d87
fix: fix test and add boost_phrase to search-ctl component
alexgarel Oct 16, 2024
f65518c
fix: if q is empty, it should not be analyzed by luqum
alexgarel Oct 16, 2024
7567f14
test: add test for empty q
alexgarel Oct 16, 2024
049bacb
chore: docs fixes and poetry lock update
alexgarel Oct 16, 2024
15d6cab
test: fix test run for cold run
alexgarel Oct 16, 2024
dead58b
chore: fix python version to be able to build schema generation image
alexgarel Oct 16, 2024
792d848
chore: Apply suggestions from code review
alexgarel Oct 18, 2024
1270c3f
fix: better handling hyphen and apostrophe
alexgarel Oct 23, 2024
0b6084f
fix: types in analyzers
alexgarel Oct 23, 2024
7ad0369
chore: remove unused logger instance
alexgarel Oct 23, 2024
b3c5548
fix: poetry not in package mode
alexgarel Oct 23, 2024
62ed4b7
docs: typo in comment
alexgarel Oct 24, 2024
7f20e75
feat: add main fallback for search in off.html
alexgarel Oct 24, 2024
4 changes: 4 additions & 0 deletions Dockerfile
@@ -40,6 +40,10 @@ COPY --from=builder-base $POETRY_HOME $POETRY_HOME
RUN poetry config virtualenvs.create false
ENV POETRY_VIRTUALENVS_IN_PROJECT=false

# create some folders, so we can ensure the right ownership later on
RUN mkdir -p /opt/search/data && \
mkdir -p /opt/search/synonyms

# create off user
ARG USER_UID
ARG USER_GID
1 change: 1 addition & 0 deletions Makefile
@@ -23,6 +23,7 @@ endif
DOCKER_COMPOSE=docker compose --env-file=${ENV_FILE}
DOCKER_COMPOSE_TEST=COMPOSE_PROJECT_NAME=search_test docker compose --env-file=${ENV_FILE}

.PHONY: build create_external_volumes livecheck up down test test_front test_front_watch test_api import-dataset import-taxonomies sync-scripts build-translations generate-openapi check check_front check_translations lint lint_back lint_front
#------------#
# Production #
#------------#
20 changes: 11 additions & 9 deletions app/_import.py
@@ -12,13 +12,14 @@
from redis import Redis

from app._types import FetcherResult, FetcherStatus, JSONType
from app.config import Config, IndexConfig, TaxonomyConfig
from app.config import Config, IndexConfig, TaxonomyConfig, settings
from app.indexing import (
DocumentProcessor,
generate_index_object,
generate_taxonomy_index_object,
)
from app.taxonomy import get_taxonomy
from app.taxonomy import iter_taxonomies
from app.taxonomy_es import refresh_synonyms
from app.utils import connection, get_logger, load_class_object_from_string
from app.utils.io import jsonl_iter

@@ -260,10 +261,7 @@ def gen_taxonomy_documents(
:param supported_langs: a set of supported languages
:yield: a dict with the document to index, compatible with ES bulk API
"""
for taxonomy_source_config in tqdm.tqdm(taxonomy_config.sources):
taxonomy = get_taxonomy(
taxonomy_source_config.name, str(taxonomy_source_config.url)
)
for taxonomy_name, taxonomy in tqdm.tqdm(iter_taxonomies(taxonomy_config)):
for node in taxonomy.iter_nodes():
names = {}
for lang in supported_langs:
@@ -278,7 +276,7 @@
"_index": next_index,
"_source": {
"id": node.id,
"taxonomy_name": taxonomy_source_config.name,
"taxonomy_name": taxonomy_name,
"names": names,
},
}
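For context, each yielded item is a plain dict ready for the ES bulk helper; an illustrative action (all values made up, only the shape follows the code above) could look like:

# illustrative bulk action for one taxonomy node (values are made up)
{
    "_index": "off_taxonomy-next",
    "_source": {
        "id": "en:apples",
        "taxonomy_name": "category",
        "names": {"en": "Apples", "fr": "Pommes"},
    },
}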
@@ -353,8 +351,7 @@ def import_taxonomies(config: IndexConfig, next_index: str):
:param config: the index configuration to use
:param next_index: the index to write to
"""
# open a connection for this process
es = connection.get_es_client(timeout=120, retry_on_timeout=True)
es = connection.current_es_client()
# Note that bulk works better than parallel bulk for our usecase.
# The preprocessing in this file is non-trivial, so it's better to
# parallelize that. If we then do parallel_bulk
@@ -542,6 +539,11 @@ def perform_taxonomy_import(config: IndexConfig) -> None:
update_alias(es_client, next_index, config.taxonomy.index.name)


def perform_refresh_synonyms(index_id: str, config: IndexConfig) -> None:
"""Refresh synonyms files generated by taxonomies"""
refresh_synonyms(index_id, config, settings.synonyms_path)


def run_update_daemon(config: Config) -> None:
"""Run the update import daemon.

43 changes: 36 additions & 7 deletions app/cli/main.py
@@ -131,24 +131,53 @@ def import_taxonomies(
default=None,
help=INDEX_ID_HELP,
),
skip_indexing: bool = typer.Option(
default=False,
help="Skip putting taxonomies in the ES index",
),
skip_synonyms: bool = typer.Option(
default=False,
help="Skip creating synonyms files for ES analyzers",
),
):
"""Import taxonomies into Elasticsearch.

It get taxonomies json files as specified in the configuration file.
It downloads taxonomy JSON files as specified in the configuration file.

It creates taxonomy indexes (for auto-completion).

It creates synonyms files for ElasticSearch analyzers
(enabling full-text search to benefit from synonyms).
"""
import time

from app._import import perform_taxonomy_import
from app.utils import get_logger
from app._import import perform_refresh_synonyms, perform_taxonomy_import
from app.utils import connection, get_logger

logger = get_logger()

index_id, index_config = _get_index_config(config_path, index_id)

start_time = time.perf_counter()
perform_taxonomy_import(index_config)
end_time = time.perf_counter()
logger.info("Import time: %s seconds", end_time - start_time)
# open a connection for this process
connection.get_es_client(timeout=120, retry_on_timeout=True)

if skip_indexing:
logger.info("Skipping indexing of taxonomies")
else:
start_time = time.perf_counter()
perform_taxonomy_import(index_config)
end_time = time.perf_counter()
logger.info("Import time: %s seconds", end_time - start_time)
if skip_synonyms:
logger.info("Skipping synonyms generation")
else:
start_time = time.perf_counter()
perform_refresh_synonyms(
index_id,
index_config,
)
end_time = time.perf_counter()
logger.info("Synonyms generation time: %s seconds", end_time - start_time)


@cli.command()
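Assuming Typer's default option naming (underscores in parameter names become dashes), the new flags would be used along these lines:

# illustrative invocations (flag spellings assume Typer defaults)
python3 -m app import-taxonomies                   # index taxonomies and generate synonyms
python3 -m app import-taxonomies --skip-indexing   # only regenerate the synonyms files
python3 -m app import-taxonomies --skip-synonyms   # only (re)index the taxonomies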
12 changes: 12 additions & 0 deletions app/config.py
@@ -121,6 +121,12 @@ class Settings(BaseSettings):
description="User-Agent used when fetching resources (taxonomies) or documents"
),
] = "search-a-licious"
synonyms_path: Annotated[
Path,
Field(
description="Path of the directory that will contain synonyms for ElasticSearch instances"
),
] = Path("/opt/search/synonyms")


settings = Settings()
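Since Settings is a pydantic BaseSettings, the new default can presumably be overridden from the environment (assuming no custom env prefix is configured; pydantic matches field names case-insensitively), e.g.:

# assumption: the environment variable name mirrors the field name
SYNONYMS_PATH=/mnt/elasticsearch/synonyms python3 -m app import-taxonomies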
@@ -228,6 +234,7 @@ class FieldType(StrEnum):
Tokenization will use analyzers specific to each languages.
* taxonomy: a field akin to keyword but
with support for matching using taxonomy synonyms and translations
(it can also be given a text mapping for full-text search)
* disabled: a field that is not stored nor searchable
(see [Elasticsearch help])
* object: this field contains a dict with sub-fields.
@@ -474,11 +481,16 @@ class TaxonomyConfig(BaseModel):
"""Configuration of taxonomies,
that is collections of entries with synonyms in multiple languages.

See [Explain taxonomies](../explain-taxonomies)

Fields may be linked to taxonomies.

It enables enriching search with synonyms,
as well as providing suggestions,
or informative facets.

Note: if you define taxonomies, you must import them using
[import-taxonomies command](../ref-python/cli.html#python3-m-app-import-taxonomies)
"""

sources: Annotated[
4 changes: 2 additions & 2 deletions app/indexing.py
@@ -18,7 +18,7 @@
)
from app.taxonomy import get_taxonomy
from app.utils import load_class_object_from_string
from app.utils.analyzers import AUTOCOMPLETE_ANALYZERS
from app.utils.analyzers import get_autocomplete_analyzer

FIELD_TYPE_TO_DSL_TYPE = {
FieldType.keyword: dsl_field.Keyword,
@@ -405,7 +405,7 @@ def generate_taxonomy_mapping_object(config: IndexConfig) -> Mapping:
dynamic=False,
properties={
lang: dsl_field.Completion(
analyzer=AUTOCOMPLETE_ANALYZERS.get(lang, "simple"),
analyzer=get_autocomplete_analyzer(lang),
contexts=[
{
"name": "taxonomy_name",
10 changes: 9 additions & 1 deletion app/taxonomy.py
@@ -3,14 +3,15 @@
See also :py:mod:`app.taxonomy_es`
"""

from collections.abc import Iterator
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Set, Union

import cachetools
import requests

from app._types import JSONType
from app.config import settings
from app.config import TaxonomyConfig, settings
from app.utils import get_logger
from app.utils.download import download_file, http_session, should_download_file
from app.utils.io import load_json
@@ -353,3 +354,10 @@ def get_taxonomy(
logger.info("Downloading taxonomy, saving it in %s", taxonomy_path)
download_file(taxonomy_url, taxonomy_path)
return Taxonomy.from_path(taxonomy_path)


def iter_taxonomies(taxonomy_config: TaxonomyConfig) -> Iterator[tuple[str, Taxonomy]]:
"""Iterate over the taxonomies listed in the configuration, fetching each as needed.

:yield: (taxonomy name, Taxonomy) tuples
"""
for taxonomy_source_config in taxonomy_config.sources:
yield taxonomy_source_config.name, get_taxonomy(
taxonomy_source_config.name, str(taxonomy_source_config.url)
)
57 changes: 57 additions & 0 deletions app/taxonomy_es.py
@@ -3,10 +3,16 @@
See also :py:mod:`app.taxonomy`
"""

import os
import shutil
from pathlib import Path

from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q

from app.config import IndexConfig
from app.taxonomy import Taxonomy, iter_taxonomies
from app.utils import connection


def get_taxonomy_names(
@@ -27,3 +33,54 @@ def get_taxonomy_names(
(result.id, result.taxonomy_name): result.names.to_dict()
for result in query.execute().hits
}


def create_synonyms_files(taxonomy: Taxonomy, langs: list[str], target_dir: Path):
"""Create a set of files that can be used to define a Synonym Graph Token Filter

see:
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-with-synonyms.html#synonyms-store-synonyms-file
"""

# auto-generate synonyms files for each language, ready to write to
synonyms_paths = {lang: (target_dir / f"{lang}.txt") for lang in langs}
synonyms_files = {lang: fpath.open("w") for lang, fpath in synonyms_paths.items()}

for node in taxonomy.iter_nodes():
for lang, synonyms in node.synonyms.items():
if not synonyms or lang not in langs:
continue
# avoid commas in synonyms…
synonyms = [s.replace(",", " ") for s in synonyms]
synonyms_files[lang].write(f"{','.join(synonyms)} => {node.id}\n")

# close files
for f in synonyms_files.values():
f.close()
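The generated files use the explicit-mapping flavour of the Solr synonyms format that Elasticsearch reads (comma-separated synonyms on the left, the taxonomy node id on the right); an illustrative en.txt with made-up entries:

# illustrative content of synonyms/category/en.txt (entries are made up)
golden delicious,golden apple => en:golden-delicious
granny smith,granny smith apple => en:granny-smith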


def create_synonyms(index_config: IndexConfig, target_dir: Path):
for name, taxonomy in iter_taxonomies(index_config.taxonomy):
target = target_dir / name
# a temporary directory, we move at the end
target_tmp = target_dir / f"{name}.tmp"
# ensure directory
os.makedirs(target_tmp, mode=0o775, exist_ok=True)
# generate synonyms files
create_synonyms_files(taxonomy, index_config.supported_langs, target_tmp)
# move to final location, overriding previous files
# (guard the moves: on a first run there is no previous directory)
if target.exists():
    shutil.move(target, str(target) + ".old")
shutil.move(target_tmp, target)
if os.path.exists(str(target) + ".old"):
    shutil.rmtree(str(target) + ".old")
# Note: in the current deployment, files are shared between ES instances,
# so we don't need to replicate them


def refresh_synonyms(index_name: str, index_config: IndexConfig, target_dir: Path):
create_synonyms(index_config, target_dir)
es = connection.current_es_client()
if es.indices.exists(index=index_name):
# trigger update of synonyms in token filters by reloading search analyzers
# and clearing relevant cache
es.indices.reload_search_analyzers(index=index_name)
es.indices.clear_cache(index=index_name, request=True)
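A quick way to check that the reload took effect is the _analyze API; a minimal sketch, where the index and analyzer names are assumptions rather than taken from the PR:

# sketch: verify reloaded synonyms via the _analyze API (names are assumptions)
resp = es.indices.analyze(
    index="food",
    body={"analyzer": "search_category_en", "text": "granny smith"},
)
print([t["token"] for t in resp["tokens"]])  # expect "en:granny-smith" among the tokens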
63 changes: 51 additions & 12 deletions app/utils/analyzers.py
@@ -1,15 +1,54 @@
"""Defines some analyzers for the elesaticsearch fields."""

from elasticsearch_dsl import analyzer

#: An analyzer for the autocomplete field
AUTOCOMPLETE_ANALYZERS = {
"fr": analyzer(
"autocomplete_fr", tokenizer="standard", filter=["lowercase", "asciifolding"]
),
"de": analyzer(
"autocomplete_de",
tokenizer="standard",
filter=["lowercase", "german_normalization"],
),
from elasticsearch_dsl import analyzer, token_filter

# some normalizers existing in ES that are specific to some languages
SPECIAL_NORMALIZERS = {
"ar": "arabic_normalization",
"bn": "bengali_normalization",
"de": "german_normalization",
"hi": "hindi_normalization",
"inc": "indic_normalization",
"fa": "persian_normalization",
"sv": "scandinavian_folding",
"da": "scandinavian_folding",
"no": "scandinavian_folding",
"fi": "scandinavian_folding",
"sr": "serbian_normalization",
"ckb": "sorani_normalization",
}


def get_taxonomy_synonym_filter(taxonomy: str, lang: str) -> token_filter:
"""Return the synonym filter to use for the taxonomized field analyzer"""
return token_filter(
f"synonym_graph_{taxonomy}_{lang}",
type="synonym_graph",
synonyms_path=f"synonyms/{taxonomy}/{lang}.txt",
)


def get_taxonomy_analyzer(taxonomy: str, lang: str) -> analyzer:
"""Return the search analyzer to use for the taxonomized field

It includes synonyms and some filters to improve the search
"""
return analyzer(
f"search_{taxonomy}_{lang}",
tokenizer="standard",
filter=[
"lowercase",
SPECIAL_NORMALIZERS.get(lang, "asciifolding"),
# this filter will be generated by `get_taxonomy_synonym_filter`
f"synonym_graph_{taxonomy}_{lang}",
],
)


def get_autocomplete_analyzer(lang: str) -> analyzer:
"""Return the search analyzer to use for the autocomplete field"""
return analyzer(
f"autocomplete_{lang}",
tokenizer="standard",
filter=["lowercase", SPECIAL_NORMALIZERS.get(lang, "asciifolding")],
)
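As a small usage sketch (language choices illustrative), the per-language fallback works like this:

# usage sketch: which normalization filter each language gets
get_autocomplete_analyzer("de")  # standard tokenizer + lowercase + german_normalization
get_autocomplete_analyzer("en")  # "en" not in SPECIAL_NORMALIZERS -> lowercase + asciifolding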
3 changes: 2 additions & 1 deletion data/config/openfoodfacts.yml
@@ -74,7 +74,8 @@ indices:
bucket_agg: true
quantity:
type: text
categories_tags:
full_text_search: true
type: keyword
taxonomy_name: category
bucket_agg: true