diff --git a/Makefile b/Makefile index bdec6eb9..fe7c74e4 100644 --- a/Makefile +++ b/Makefile @@ -137,6 +137,11 @@ import-dataset: guard-filepath @echo "🔎 Importing data …" ${DOCKER_COMPOSE} run --rm api python3 -m app import /opt/search/data/${filepath} ${args} --num-processes=2 +import-taxonomies: + @echo "🔎 Importing taxonomies …" + ${DOCKER_COMPOSE} run --rm api python3 -m app import-taxonomies ${args} + + #-------# # Tests # diff --git a/README.md b/README.md index 296a4f0b..7cea99ef 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,7 @@ Now you can go to : or - http://localhost:8000/static/off.html to access to lit components search page +To look into the data, you may use elasticvue, going to http://127.0.0.1:8080/ and reaching http://127.0.0.1:9200 cluster: `docker-cluster` (unless you changed env variables). #### Pre-Commit @@ -141,6 +142,10 @@ Typical import time is 45-60 minutes. If you want to skip updates (eg. because you don't have a Redis installed), use `make import-dataset filepath='products.jsonl.gz' args="--skip-updates"` +You should also import taxonomies: + +`make import-taxonomies` + ## Fundings diff --git a/app/api.py b/app/api.py index 6a520402..3f07b169 100644 --- a/app/api.py +++ b/app/api.py @@ -109,6 +109,14 @@ def check_facets_are_valid(index_id: str | None, facets: list[str] | None) -> No raise HTTPException(status_code=400, detail=json.dumps(errors)) +def parse_langs(langs: str | None) -> list[str]: + return langs.split(",") if langs else ["en"] + + +def get_main_lang(langs: list[str] | None) -> str: + return langs[0] if langs else "en" + + @app.get("/search") def search( q: Annotated[ @@ -183,11 +191,13 @@ def search( status_code=400, detail=f"Maximum number of returned results is 10 000 (here: page * page_size = {page * page_size})", ) - langs_list = langs.split(",") if langs else ["en"] + langs_list = parse_langs(langs) + main_lang = get_main_lang(langs_list) # search return app_search.search( q=q, langs=langs_list, + 
def _pick_translation(
    lang: str, item_id: str, names: dict[str, list[str]] | None
) -> str | None:
    """Choose the best available name for one taxonomy item.

    Languages are tried by decreasing preference:

    1. the requested language ``lang``
    2. the language prefix of the item id (part before the first ``:``)
    3. english

    :param lang: requested language
    :param item_id: id of the item, presumably of the form ``<lang>:<value>``
    :param names: names of the item, by language, or None if item is unknown
    :return: first name in the best language found, None if there is none
    """
    if not names:
        return None
    item_lang = item_id.split(":", 1)[0]
    for candidate in (lang, item_lang, "en"):
        candidate_names = names.get(candidate)
        if candidate_names:
            return candidate_names[0]
    return None


def _get_translations(
    lang: str, items: list[tuple[str, str]], index_config: config.IndexConfig
) -> dict[tuple[str, str], str]:
    """Compute the translations of facets items.

    Only items of fields declaring a ``taxonomy_name`` in the index
    configuration can be translated; other items are simply absent from
    the result.

    :param lang: target language
    :param items: list of ``(item id, field name)`` to translate
    :param index_config: configuration of the index, used to go
        from field name to taxonomy name
    :return: translations found, keyed by ``(item id, field name)``
    """
    # go from field_name to taxonomy
    field_taxonomy: dict[str, str] = {}
    for field_name in {field_name for _, field_name in items}:
        taxonomy_name = index_config.fields[field_name].taxonomy_name
        if taxonomy_name:
            field_taxonomy[field_name] = taxonomy_name
    # fetch items names
    items_to_fetch = [
        (item_id, field_taxonomy[field_name])
        for item_id, field_name in items
        if field_name in field_taxonomy
    ]
    if not items_to_fetch:
        # nothing belongs to a taxonomy: avoid a useless lookup
        return {}
    items_names = get_taxonomy_names(items_to_fetch, index_config)
    # compute best translations
    translations: dict[tuple[str, str], str] = {}
    for item_id, field_name in items:
        taxonomy_name = field_taxonomy.get(field_name)
        if taxonomy_name is None:
            continue
        translation = _pick_translation(
            lang, item_id, items_names.get((item_id, taxonomy_name))
        )
        if translation:
            translations[(item_id, field_name)] = translation
    return translations


def translate_facets_values(
    lang: str, facets: FacetsInfos, index_config: config.IndexConfig
):
    """Translate values of facets, in place.

    The ``name`` of each facet item is replaced by its translation
    when one is found, and left untouched otherwise.

    :param lang: target language
    :param facets: facets to translate, by field name
    :param index_config: configuration of the index
    """
    # harvest items to translate
    items = [
        (item.key, field_name)
        for field_name, info in facets.items()
        for item in info.items
    ]
    translations = _get_translations(lang, items, index_config)
    # translate facets
    for field_name, info in facets.items():
        for item in info.items:
            item.name = translations.get((item.key, field_name), item.name)
def get_taxonomy_names(
    items: list[tuple[str, str]],
    config: IndexConfig,
) -> dict[tuple[str, str], dict[str, list[str]]]:
    """Given a set of terms in different taxonomies, return their names

    :param items: list of ``(term id, taxonomy name)`` to look for
    :param config: index configuration, giving the name of the taxonomy index
    :return: names by language for each term found,
        keyed by ``(term id, taxonomy name)``; terms not found are absent
    """
    # drop duplicates (keeping order) so each term yields a single clause
    unique_items = list(dict.fromkeys(items))
    if not unique_items:
        # nothing to search for: spare a round trip to Elasticsearch
        return {}
    # each filter matches exactly one term of one taxonomy
    filters = [
        Q("term", id=item_id) & Q("term", taxonomy_name=taxonomy_name)
        for item_id, taxonomy_name in unique_items
    ]
    query = (
        Search(index=config.taxonomy.index.name)
        .filter("bool", should=filters, minimum_should_match=1)
        # NOTE(review): assumes at most one document per (id, taxonomy_name)
        .params(size=len(filters))
    )
    return {
        (result.id, result.taxonomy_name): result.names.to_dict()
        for result in query.execute().hits
    }
indices: type: text categories_tags: type: keyword + taxonomy_name: category bucket_agg: true labels_tags: type: keyword + taxonomy_name: label bucket_agg: true countries_tags: type: keyword bucket_agg: true + taxonomy_name: country states_tags: type: keyword bucket_agg: true + taxonomy_name: state origins_tags: type: keyword ingredients_tags: type: keyword + taxonomy_name: ingredient unique_scans_n: type: integer scans_n: @@ -93,6 +98,7 @@ indices: type: integer allergens_tags: type: keyword + taxonomy_name: allergen ecoscore_data: type: disabled ecoscore_score: @@ -123,16 +129,6 @@ indices: type: integer completeness: type: float - facets: - # include all fields - default: - order: - - categories_tags - - brands_tags - - labels_tags - - ecoscore_grade - - nova_groups - - ecoscore_grade document_denylist: - '8901552007122' lang_separator: _ diff --git a/tests/unit/data/openfoodfacts_config.yml b/tests/unit/data/openfoodfacts_config.yml index a7eeccb9..4ec1f685 100644 --- a/tests/unit/data/openfoodfacts_config.yml +++ b/tests/unit/data/openfoodfacts_config.yml @@ -56,9 +56,11 @@ indices: type: text categories_tags: type: keyword + taxonomy_name: category bucket_agg: true labels_tags: type: keyword + taxonomy_name: label bucket_agg: true countries_tags: type: keyword