Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: translating facets values using taxonomies #121

Merged
merged 2 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,11 @@ import-dataset: guard-filepath
@echo "🔎 Importing data …"
${DOCKER_COMPOSE} run --rm api python3 -m app import /opt/search/data/${filepath} ${args} --num-processes=2

# Import taxonomies by running the app's `import-taxonomies` command
# in a one-off `api` container (removed after the run).
# Extra command line options can be passed through `args`.
import-taxonomies:
	@echo "🔎 Importing taxonomies …"
	${DOCKER_COMPOSE} run --rm api python3 -m app import-taxonomies ${args}



#-------#
# Tests #
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ Now you can go to :
or
- http://localhost:8000/static/off.html to access the Lit components search page

To look into the data, you may use elasticvue: go to http://127.0.0.1:8080/ and connect to the http://127.0.0.1:9200 cluster, `docker-cluster` (unless you changed the environment variables).

#### Pre-Commit

Expand Down Expand Up @@ -141,6 +142,10 @@ Typical import time is 45-60 minutes.
If you want to skip updates (e.g. because you don't have Redis installed),
use `make import-dataset filepath='products.jsonl.gz' args="--skip-updates"`

You should also import taxonomies:

`make import-taxonomies`


## Fundings

Expand Down
12 changes: 11 additions & 1 deletion app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ def check_facets_are_valid(index_id: str | None, facets: list[str] | None) -> No
raise HTTPException(status_code=400, detail=json.dumps(errors))


def parse_langs(langs: str | None) -> list[str]:
return langs.split(",") if langs else ["en"]


def get_main_lang(langs: list[str] | None) -> str:
return langs[0] if langs else "en"


@app.get("/search")
def search(
q: Annotated[
Expand Down Expand Up @@ -183,11 +191,13 @@ def search(
status_code=400,
detail=f"Maximum number of returned results is 10 000 (here: page * page_size = {page * page_size})",
)
langs_list = langs.split(",") if langs else ["en"]
langs_list = parse_langs(langs)
main_lang = get_main_lang(langs_list)
# search
return app_search.search(
q=q,
langs=langs_list,
main_lang=main_lang,
page_size=page_size,
page=page,
fields=fields.split(",") if fields else None,
Expand Down
8 changes: 0 additions & 8 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,6 @@ class FieldConfig(BaseModel):
),
] = True

@model_validator(mode="after")
def taxonomy_name_should_be_used_for_taxonomy_type_only(self):
    """Validator that checks that `taxonomy_name` is only provided for
    fields with type `taxonomy`.

    :raises ValueError: if `taxonomy_name` is set
        while `type` is not `FieldType.taxonomy`
    :return: the instance itself, unchanged
    """
    # NOTE(review): the diff header for this file reports 8 deletions and
    # the YAML configs in this change set `taxonomy_name` on `keyword`
    # fields, which this check would reject — presumably this validator is
    # meant to be removed; confirm before keeping it.
    if self.type is not FieldType.taxonomy and self.taxonomy_name is not None:
        raise ValueError("taxonomy_name should be provided for taxonomy type only")
    return self

@model_validator(mode="after")
def bucket_agg_should_be_used_for_keyword_and_numeric_types_only(self):
"""Validator that checks that `bucket_agg` is only provided for
Expand Down
67 changes: 64 additions & 3 deletions app/facets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
QueryAnalysis,
SearchResponse,
)
from .taxonomy_es import get_taxonomy_names


def safe_get_index_config(
Expand Down Expand Up @@ -44,9 +45,69 @@ def check_all_facets_fields_are_agg(
return errors


def _get_translations(
lang: str, items: list[tuple[str, str]], index_config: config.IndexConfig
) -> dict[tuple[str, str], str]:
# go from field_name to taxonomy
field_names = set([field_name for _, field_name in items])
field_taxonomy: dict[str, str] = {
# note: the `or ""` is only to make typing understand it can't be None
field_name: index_config.fields[field_name].taxonomy_name or ""
for field_name in field_names
if index_config.fields[field_name].taxonomy_name
}
# fetch items names
items_to_fetch = [
(id, field_taxonomy[field_name])
for id, field_name in items
if field_name in field_taxonomy
]
items_names = get_taxonomy_names(items_to_fetch, index_config)
# compute best translations
translations: dict[tuple[str, str], str] = {}
for id, field_name in items:
item_translations = None
names = (
items_names.get((id, field_taxonomy[field_name]))
if field_name in field_taxonomy
else None
)
if names:
item_translations = names.get(lang, None)
# fold back to main language for item
if not item_translations:
main_lang = id.split(":", 1)[0]
item_translations = names.get(main_lang, None)
# fold back to english
if not translations:
item_translations = names.get("en", None)
# eventually translate
if item_translations:
translations[(id, field_name)] = item_translations[0]
return translations
alexgarel marked this conversation as resolved.
Show resolved Hide resolved


def translate_facets_values(
    lang: str, facets: FacetsInfos, index_config: config.IndexConfig
):
    """Replace, in place, the name of facets values by their translation.

    Values for which `_get_translations` gives no translation
    keep their current name.

    :param lang: target language
    :param facets: facets by field name, their items are modified in place
    :param index_config: index configuration, passed to `_get_translations`
    """
    # first pass: collect every (value key, field name) pair
    to_translate = []
    for facet_field, facet in facets.items():
        for entry in facet.items:
            to_translate.append((entry.key, facet_field))
    translated = _get_translations(lang, to_translate, index_config)
    # second pass: apply translations, defaulting to the existing name
    for facet_field, facet in facets.items():
        for entry in facet.items:
            lookup_key = (entry.key, facet_field)
            entry.name = translated.get(lookup_key, entry.name)


def build_facets(
search_result: SearchResponse,
query_analysis: QueryAnalysis,
lang: str,
index_config: config.IndexConfig,
facets_names: list[str] | None,
) -> FacetsInfos:
Expand Down Expand Up @@ -114,8 +175,7 @@ def build_facets(
# key="--none--",
# # TODO: translate in target language ?
# name="None",
# # Note: this depends on search_result.is_count_exact,
# # but we leave it to user to verify
# # Note:translate_facets_values leave it to user to verify
# count=search_result.count - items_count,
# # FIXME: compute selected !
# selected=False,
Expand All @@ -136,5 +196,6 @@ def build_facets(
items=facet_items,
count_error_margin=count_error_margin,
)

# translate
translate_facets_values(lang, facets, index_config)
return facets
5 changes: 4 additions & 1 deletion app/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def search(
page_size: int,
fields: list[str] | None,
langs: list[str],
main_lang: str,
facets: list[str] | None,
) -> SearchResponse:
"""Run a search"""
Expand Down Expand Up @@ -83,7 +84,9 @@ def search(
page_size=page_size,
projection=projection,
)
search_result.facets = build_facets(search_result, query, index_config, facets)
search_result.facets = build_facets(
search_result, query, main_lang, index_config, facets
)
# remove aggregations to avoid sending too much information
search_result.aggregations = None
return search_result
26 changes: 26 additions & 0 deletions app/taxonomy_es.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Operations on taxonomies in Elastic Search"""

from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q

from app.config import IndexConfig


def get_taxonomy_names(
    items: list[tuple[str, str]],
    config: IndexConfig,
) -> dict[tuple[str, str], dict[str, list[str]]]:
    """Given a set of terms in different taxonomies, return their names

    :param items: (term id, taxonomy name) pairs to look for
    :param config: index configuration,
        the index searched is `config.taxonomy.index.name`
    :return: for each hit, its (id, taxonomy name) mapped to its `names`,
        as a dict. Items that were not found are absent from the result.
    """
    if not items:
        # nothing to search for, avoid a useless request
        return {}
    # each item must match on both its id and its taxonomy
    filters = [
        Q("term", id=item_id) & Q("term", taxonomy_name=taxonomy_name)
        for item_id, taxonomy_name in items
    ]
    query = (
        Search(index=config.taxonomy.index.name)
        # a document is kept as soon as it matches one of the items
        .filter("bool", should=filters, minimum_should_match=1)
        # ask for as many results as there are items
        .params(size=len(filters))
    )
    return {
        (result.id, result.taxonomy_name): result.names.to_dict()
        for result in query.execute().hits
    }
16 changes: 6 additions & 10 deletions data/config/openfoodfacts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,25 @@ indices:
type: text
categories_tags:
type: keyword
taxonomy_name: category
bucket_agg: true
labels_tags:
type: keyword
taxonomy_name: label
bucket_agg: true
countries_tags:
type: keyword
bucket_agg: true
taxonomy_name: country
states_tags:
type: keyword
bucket_agg: true
taxonomy_name: state
origins_tags:
type: keyword
ingredients_tags:
type: keyword
taxonomy_name: ingredient
unique_scans_n:
type: integer
scans_n:
Expand All @@ -93,6 +98,7 @@ indices:
type: integer
allergens_tags:
type: keyword
taxonomy_name: allergen
ecoscore_data:
type: disabled
ecoscore_score:
Expand Down Expand Up @@ -123,16 +129,6 @@ indices:
type: integer
completeness:
type: float
facets:
# include all fields
default:
order:
- categories_tags
- brands_tags
- labels_tags
- ecoscore_grade
- nova_groups
- ecoscore_grade
document_denylist:
- '8901552007122'
lang_separator: _
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/data/openfoodfacts_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ indices:
type: text
categories_tags:
type: keyword
taxonomy_name: category
bucket_agg: true
labels_tags:
type: keyword
taxonomy_name: label
bucket_agg: true
countries_tags:
type: keyword
Expand Down