Skip to content

Commit

Permalink
feat: translating facets values using taxonomies (#121)
Browse files Browse the repository at this point in the history
fixes: #120
  • Loading branch information
alexgarel authored Jun 6, 2024
1 parent d4f1c9e commit d1162e0
Show file tree
Hide file tree
Showing 9 changed files with 123 additions and 23 deletions.
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,11 @@ import-dataset: guard-filepath
@echo "🔎 Importing data …"
${DOCKER_COMPOSE} run --rm api python3 -m app import /opt/search/data/${filepath} ${args} --num-processes=2

# Run the app's `import-taxonomies` command in a one-off `api` container
# (removed afterwards because of `--rm`); extra CLI options go through `args`.
import-taxonomies:
@echo "🔎 Importing taxonomies …"
${DOCKER_COMPOSE} run --rm api python3 -m app import-taxonomies ${args}



#-------#
# Tests #
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ Now you can go to :
or
- http://localhost:8000/static/off.html to access the Lit components search page

To look into the data, you may use elasticvue: go to http://127.0.0.1:8080/ and connect to the cluster at http://127.0.0.1:9200, named `docker-cluster` (unless you changed the environment variables).

#### Pre-Commit

Expand Down Expand Up @@ -141,6 +142,10 @@ Typical import time is 45-60 minutes.
If you want to skip updates (e.g. because you don't have Redis installed),
use `make import-dataset filepath='products.jsonl.gz' args="--skip-updates"`
You should also import taxonomies:
`make import-taxonomies`
## Fundings
Expand Down
12 changes: 11 additions & 1 deletion app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ def check_facets_are_valid(index_id: str | None, facets: list[str] | None) -> No
raise HTTPException(status_code=400, detail=json.dumps(errors))


def parse_langs(langs: str | None) -> list[str]:
return langs.split(",") if langs else ["en"]


def get_main_lang(langs: list[str] | None) -> str:
return langs[0] if langs else "en"


@app.get("/search")
def search(
q: Annotated[
Expand Down Expand Up @@ -183,11 +191,13 @@ def search(
status_code=400,
detail=f"Maximum number of returned results is 10 000 (here: page * page_size = {page * page_size})",
)
langs_list = langs.split(",") if langs else ["en"]
langs_list = parse_langs(langs)
main_lang = get_main_lang(langs_list)
# search
return app_search.search(
q=q,
langs=langs_list,
main_lang=main_lang,
page_size=page_size,
page=page,
fields=fields.split(",") if fields else None,
Expand Down
8 changes: 0 additions & 8 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,6 @@ class FieldConfig(BaseModel):
),
] = True

@model_validator(mode="after")
def taxonomy_name_should_be_used_for_taxonomy_type_only(self):
"""Validator that checks that `taxonomy_name` is only provided for
fields with type `taxonomy`."""
if self.type is not FieldType.taxonomy and self.taxonomy_name is not None:
raise ValueError("taxonomy_name should be provided for taxonomy type only")
return self

@model_validator(mode="after")
def bucket_agg_should_be_used_for_keyword_and_numeric_types_only(self):
"""Validator that checks that `bucket_agg` is only provided for
Expand Down
67 changes: 64 additions & 3 deletions app/facets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
QueryAnalysis,
SearchResponse,
)
from .taxonomy_es import get_taxonomy_names


def safe_get_index_config(
Expand Down Expand Up @@ -44,9 +45,69 @@ def check_all_facets_fields_are_agg(
return errors


def _get_translations(
    lang: str, items: list[tuple[str, str]], index_config: config.IndexConfig
) -> dict[tuple[str, str], str]:
    """Find the best display name for facet values of taxonomy-backed fields.

    :param lang: preferred language code for the names
    :param items: ``(value id, field name)`` pairs to translate
    :param index_config: index configuration, used to map each field to its
        taxonomy name and passed on to the taxonomy names lookup
    :return: mapping from ``(value id, field name)`` to the chosen name;
        pairs whose field has no taxonomy, or for which no usable name was
        found, are absent from the mapping
    """
    # go from field_name to taxonomy, keeping only fields that declare one
    field_taxonomy: dict[str, str] = {}
    for field_name in {field_name for _, field_name in items}:
        taxonomy_name = index_config.fields[field_name].taxonomy_name
        if taxonomy_name:
            field_taxonomy[field_name] = taxonomy_name
    # fetch items names
    items_to_fetch = [
        (item_id, field_taxonomy[field_name])
        for item_id, field_name in items
        if field_name in field_taxonomy
    ]
    items_names = get_taxonomy_names(items_to_fetch, index_config)
    # compute best translations
    translations: dict[tuple[str, str], str] = {}
    for item_id, field_name in items:
        taxonomy_name = field_taxonomy.get(field_name)
        if taxonomy_name is None:
            # field is not linked to a taxonomy: nothing to translate
            continue
        names = items_names.get((item_id, taxonomy_name))
        if not names:
            continue
        # preference order: requested language, then the language prefix of
        # the id (the part before the first ":"), then english.
        # Each fallback is only tried when the previous one gave nothing.
        item_translations = (
            names.get(lang)
            or names.get(item_id.split(":", 1)[0])
            or names.get("en")
        )
        # eventually translate, using the first name listed for the language
        if item_translations:
            translations[(item_id, field_name)] = item_translations[0]
    return translations


def translate_facets_values(
    lang: str, facets: FacetsInfos, index_config: config.IndexConfig
):
    """Replace, in place, the name of each facet item by its translation.

    Items for which no translation is found keep their current name.
    """
    # collect every (value key, field name) pair present in the facets
    to_translate = []
    for facet_field, facet_info in facets.items():
        for facet_item in facet_info.items:
            to_translate.append((facet_item.key, facet_field))
    translated = _get_translations(lang, to_translate, index_config)
    # write names back on the facet items
    for facet_field, facet_info in facets.items():
        for facet_item in facet_info.items:
            lookup_key = (facet_item.key, facet_field)
            facet_item.name = translated.get(lookup_key, facet_item.name)


def build_facets(
search_result: SearchResponse,
query_analysis: QueryAnalysis,
lang: str,
index_config: config.IndexConfig,
facets_names: list[str] | None,
) -> FacetsInfos:
Expand Down Expand Up @@ -114,8 +175,7 @@ def build_facets(
# key="--none--",
# # TODO: translate in target language ?
# name="None",
# # Note: this depends on search_result.is_count_exact,
# # but we leave it to user to verify
# # Note:translate_facets_values leave it to user to verify
# count=search_result.count - items_count,
# # FIXME: compute selected !
# selected=False,
Expand All @@ -136,5 +196,6 @@ def build_facets(
items=facet_items,
count_error_margin=count_error_margin,
)

# translate
translate_facets_values(lang, facets, index_config)
return facets
5 changes: 4 additions & 1 deletion app/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def search(
page_size: int,
fields: list[str] | None,
langs: list[str],
main_lang: str,
facets: list[str] | None,
) -> SearchResponse:
"""Run a search"""
Expand Down Expand Up @@ -83,7 +84,9 @@ def search(
page_size=page_size,
projection=projection,
)
search_result.facets = build_facets(search_result, query, index_config, facets)
search_result.facets = build_facets(
search_result, query, main_lang, index_config, facets
)
# remove aggregations to avoid sending too much information
search_result.aggregations = None
return search_result
26 changes: 26 additions & 0 deletions app/taxonomy_es.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Operations on taxonomies in Elastic Search"""

from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q

from app.config import IndexConfig


def get_taxonomy_names(
    items: list[tuple[str, str]],
    config: IndexConfig,
) -> dict[tuple[str, str], dict[str, list[str]]]:
    """Given a set of terms in different taxonomies, return their names

    :param items: ``(term id, taxonomy name)`` pairs to look up
    :param config: index configuration; the taxonomy index name is read
        from ``config.taxonomy.index.name``
    :return: mapping from ``(term id, taxonomy name)`` to the ``names`` of
        the matching document, converted to a plain dict; pairs without a
        matching document are absent
    """
    if not items:
        # nothing to look up: skip the round trip to Elasticsearch
        return {}
    # one clause per item: both the id and the taxonomy name must match
    filters = [
        Q("term", id=term_id) & Q("term", taxonomy_name=taxonomy_name)
        for term_id, taxonomy_name in items
    ]
    query = (
        Search(index=config.taxonomy.index.name)
        # a document is kept as soon as it matches one of the clauses
        .filter("bool", should=filters, minimum_should_match=1)
        # ask for as many hits as there are requested items
        .params(size=len(filters))
    )
    return {
        (result.id, result.taxonomy_name): result.names.to_dict()
        for result in query.execute().hits
    }
16 changes: 6 additions & 10 deletions data/config/openfoodfacts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,25 @@ indices:
type: text
categories_tags:
type: keyword
taxonomy_name: category
bucket_agg: true
labels_tags:
type: keyword
taxonomy_name: label
bucket_agg: true
countries_tags:
type: keyword
bucket_agg: true
taxonomy_name: country
states_tags:
type: keyword
bucket_agg: true
taxonomy_name: state
origins_tags:
type: keyword
ingredients_tags:
type: keyword
taxonomy_name: ingredient
unique_scans_n:
type: integer
scans_n:
Expand All @@ -93,6 +98,7 @@ indices:
type: integer
allergens_tags:
type: keyword
taxonomy_name: allergen
ecoscore_data:
type: disabled
ecoscore_score:
Expand Down Expand Up @@ -123,16 +129,6 @@ indices:
type: integer
completeness:
type: float
facets:
# include all fields
default:
order:
- categories_tags
- brands_tags
- labels_tags
- ecoscore_grade
- nova_groups
- ecoscore_grade
document_denylist:
- '8901552007122'
lang_separator: _
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/data/openfoodfacts_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ indices:
type: text
categories_tags:
type: keyword
taxonomy_name: category
bucket_agg: true
labels_tags:
type: keyword
taxonomy_name: label
bucket_agg: true
countries_tags:
type: keyword
Expand Down

0 comments on commit d1162e0

Please sign in to comment.