Skip to content

Commit

Permalink
feat: add support for synonyms for taxonomy field types
Browse files Browse the repository at this point in the history
Also add better documentation and unit tests
  • Loading branch information
raphael0202 committed Nov 27, 2023
1 parent 4df7893 commit 7481115
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 17 deletions.
27 changes: 17 additions & 10 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ def is_numeric(self):


class FieldConfig(BaseModel):
# name of the field (internal field), it's added here for convenience
_name: str = ""
# name of the field (internal field), it's added here for convenience.
# It's set by the `add_field_name_to_each_field` classmethod.
name: Annotated[str, Field(description="name of the field, must be unique")] = ""
type: Annotated[
FieldType,
Field(description="type of the field, see `FieldType` for possible values"),
Expand Down Expand Up @@ -169,13 +170,19 @@ class FieldConfig(BaseModel):
),
] = False
taxonomy_name: Annotated[
str | None, Field(description="only for taxonomy field type")
str | None,
Field(
description="the name of the taxonomy associated with this field. "
"It must only be provided for taxonomy field type."
),
] = None

@property
def name(self) -> str:
"""Get field name."""
return self._name
add_taxonomy_synonyms: Annotated[
bool,
Field(
description="if True, add all synonyms of the taxonomy values to the index. "
"The flag is ignored if the field type is not `taxonomy`."
),
] = True

@model_validator(mode="after")
def taxonomy_name_should_be_used_for_taxonomy_type_only(self):
Expand Down Expand Up @@ -358,9 +365,9 @@ def field_references_must_exist_and_be_valid(self):

@field_validator("fields")
@classmethod
def add_field_name_to_each_field(cls, fields):
def add_field_name_to_each_field(cls, fields: dict[str, FieldConfig]):
for field_name, field_item in fields.items():
field_item._name = field_name
field_item.name = field_name
return fields

def get_supported_langs(self) -> set[str]:
Expand Down
61 changes: 56 additions & 5 deletions app/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,23 @@ def process_text_lang_field(
split_separator: str,
supported_langs: set[str],
) -> JSONType | None:
"""Process data for a `text_lang` field type.
Generates a dict ready to be indexed by Elasticsearch, with a subfield for
each language.
:param data: input data, as a dict
:param input_field: the name of the field to use as input
:param split: whether to split the input field value, using
`split_separator` as separator
:param lang_separator: the separator used to separate the language code
from the field name
:param split_separator: the separator used to split the input field value,
in case of multi-valued input (if `split` is True)
:param supported_langs: a set of supported languages (2-letter codes), used
to know which sub-fields to create
:return: the processed data, as a dict
"""
field_input: JSONType = {}
target_fields = [
k
Expand Down Expand Up @@ -170,6 +187,26 @@ def process_taxonomy_field(
split_separator: str,
taxonomy_langs: set[str],
) -> JSONType | None:
"""Process data for a `taxonomy` field type.
Generates a dict ready to be indexed by Elasticsearch, with a subfield for
each language. Two other subfields are added:
- `original`: the original value of the field. For example, if the field
name is `categories` and `categories` already exists in the document,
we will save its value in the `original` subfield. This subfield is
only added if the field is present in the input data.
- `other`: the value of the field for languages that are not supported by
the project (no elasticsearch specific analyzers)
:param data: input data, as a dict
:param field: the field config
:param taxonomy_config: the taxonomy config
:param split_separator: the separator used to split the input field value,
in case of multi-valued input (if `field.split` is True)
:param taxonomy_langs: a set of supported languages (2-letter codes), used
to know which sub-fields to create.
:return: the processed data, as a dict
"""
field_input: JSONType = {}
input_field = field.get_input_field()
input_value = preprocess_field_value(
Expand Down Expand Up @@ -198,11 +235,25 @@ def process_taxonomy_field(
langs = taxonomy_langs | set(data.get("taxonomy_langs", []))
for lang in langs:
for single_tag in input_value:
if (value := taxonomy.get_localized_name(single_tag, lang)) is not None:
# If language is not supported (=no elasticsearch specific
# analyzers), we store the data in a "other" field
key = lang if lang in ANALYZER_LANG_MAPPING else "other"
field_input.setdefault(key, []).append(value)
if single_tag not in taxonomy:
continue

node = taxonomy[single_tag]
values = {node.get_localized_name(lang)}

if field.add_taxonomy_synonyms:
values |= set(node.get_synonyms(lang))

# Add international version of the name
if "xx" in node.names:
values |= set(node.get_synonyms("xx"))

for value in values:
if value is not None:
# If language is not supported (=no elasticsearch specific
# analyzers), we store the data in a "other" field
key = lang if lang in ANALYZER_LANG_MAPPING else "other"
field_input.setdefault(key, []).append(value)

if field.name in data:
field_input["original"] = data[field.name]
Expand Down
55 changes: 55 additions & 0 deletions tests/unit/data/openfoodfacts_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,68 @@ taxonomy:
url: https://static.openfoodfacts.org/data/taxonomies/categories.full.json
- name: label
url: https://static.openfoodfacts.org/data/taxonomies/labels.full.json
- name: additive
url: https://static.openfoodfacts.org/data/taxonomies/additives.full.json
- name: allergen
url: https://static.openfoodfacts.org/data/taxonomies/allergens.full.json
- name: amino_acid
url: https://static.openfoodfacts.org/data/taxonomies/amino_acids.full.json
- name: country
url: https://static.openfoodfacts.org/data/taxonomies/countries.full.json
- name: data_quality
url: https://static.openfoodfacts.org/data/taxonomies/data_quality.full.json
- name: food_group
url: https://static.openfoodfacts.org/data/taxonomies/food_groups.full.json
- name: improvement
url: https://static.openfoodfacts.org/data/taxonomies/improvements.full.json
- name: ingredient
url: https://static.openfoodfacts.org/data/taxonomies/ingredients.full.json
- name: ingredients_analysis
url: https://static.openfoodfacts.org/data/taxonomies/ingredients_analysis.full.json
- name: ingredients_processing
url: https://static.openfoodfacts.org/data/taxonomies/ingredients_processing.full.json
- name: language
url: https://static.openfoodfacts.org/data/taxonomies/languages.full.json
- name: mineral
url: https://static.openfoodfacts.org/data/taxonomies/minerals.full.json
- name: misc
url: https://static.openfoodfacts.org/data/taxonomies/misc.full.json
- name: nova_group
url: https://static.openfoodfacts.org/data/taxonomies/nova_groups.full.json
- name: nucleotide
url: https://static.openfoodfacts.org/data/taxonomies/nucleotides.full.json
- name: nutrient
url: https://static.openfoodfacts.org/data/taxonomies/nutrients.full.json
- name: origin
url: https://static.openfoodfacts.org/data/taxonomies/origins.full.json
- name: other_nutritional_substance
url: https://static.openfoodfacts.org/data/taxonomies/other_nutritional_substances.full.json
- name: packaging_material
url: https://static.openfoodfacts.org/data/taxonomies/packaging_materials.full.json
- name: packaging_recycling
url: https://static.openfoodfacts.org/data/taxonomies/packaging_recycling.full.json
- name: packaging_shape
url: https://static.openfoodfacts.org/data/taxonomies/packaging_shapes.full.json
- name: periods_after_opening
url: https://static.openfoodfacts.org/data/taxonomies/periods_after_opening.full.json
- name: preservation
url: https://static.openfoodfacts.org/data/taxonomies/preservation.full.json
- name: state
url: https://static.openfoodfacts.org/data/taxonomies/states.full.json
- name: vitamin
url: https://static.openfoodfacts.org/data/taxonomies/vitamins.full.json
- name: brand
url: https://static.openfoodfacts.org/data/taxonomies/brands.full.json
exported_langs:
- en
- fr
- es
- de
- it
- nl
index:
number_of_replicas: 1
number_of_shards: 4
supported_langs:
- aa
- ab
Expand Down
138 changes: 136 additions & 2 deletions tests/unit/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import pytest

from app.indexing import process_text_lang_field
from app.config import (
FieldConfig,
FieldType,
TaxonomyConfig,
TaxonomyIndexConfig,
TaxonomySourceConfig,
)
from app.indexing import process_taxonomy_field, process_text_lang_field


@pytest.mark.parametrize(
Expand All @@ -26,7 +33,16 @@
"main": "MAIN",
"other": ["VN", "ID"],
},
)
),
# Same, but without main language
(
{
"product_name_fr": "FR",
},
"product_name",
False,
{"fr": "FR"},
),
],
)
def test_process_text_lang_field(data, input_field, split, expected):
Expand All @@ -44,3 +60,121 @@ def test_process_text_lang_field(data, input_field, split, expected):
)
== expected
)


# Taxonomy configuration shared by the `process_taxonomy_field` test cases:
# a single `category` source, with English as the only exported language.
_category_source = TaxonomySourceConfig(
    name="category",
    url="https://static.openfoodfacts.org/data/taxonomies/categories.full.json",
)
taxonomy_config = TaxonomyConfig(
    sources=[_category_source],
    exported_langs=["en"],
    index=TaxonomyIndexConfig(),
)


@pytest.mark.parametrize(
    "data, field, taxonomy_config, taxonomy_langs, expected",
    [
        (
            {
                "taxonomy_langs": ["fr", "it"],
                # en:edamame has a "xx" name in the taxonomy
                "categories_tags": "en:beverages,en:alcoholic-beverages,en:not-in-taxonomy,en:edamame",
                # the original name should be saved under an `original` key
                "categories": "Boissons,Boissons alcoolisées,Edamame",
            },
            FieldConfig(
                type=FieldType.taxonomy,
                name="categories",
                input_field="categories_tags",
                split=True,
                add_taxonomy_synonyms=True,
                taxonomy_name="category",
            ),
            taxonomy_config,
            {"en"},
            {
                "fr": [
                    "Boissons",
                    "alcool",
                    "alcools",
                    "boisson alcoolisée",
                    "Boissons alcoolisées",
                    "Edamame",
                ],
                "it": ["Bevande", "Bevande alcoliche", "Edamame"],
                "en": [
                    "Drinks",
                    "Beverages",
                    "Alcoholic beverages",
                    "drinks with alcohol",
                    "alcohols",
                    "Alcoholic drinks",
                    "Edamame",
                ],
                "original": "Boissons,Boissons alcoolisées,Edamame",
            },
        ),
        # Same, but without synonyms
        (
            {
                "taxonomy_langs": ["fr", "it"],
                "categories_tags": "en:beverages,en:alcoholic-beverages",
            },
            FieldConfig(
                type=FieldType.taxonomy,
                name="categories",
                input_field="categories_tags",
                split=True,
                add_taxonomy_synonyms=False,
                taxonomy_name="category",
            ),
            taxonomy_config,
            {"en"},
            {
                "fr": [
                    "Boissons",
                    "Boissons alcoolisées",
                ],
                "it": ["Bevande", "Bevande alcoliche"],
                "en": [
                    "Beverages",
                    "Alcoholic beverages",
                ],
            },
        ),
        # The field is missing here, we should return None
        (
            {"taxonomy_langs": ["fr", "it"]},
            FieldConfig(
                type=FieldType.taxonomy,
                name="categories",
                input_field="categories_tags",
                split=True,
                add_taxonomy_synonyms=False,
                taxonomy_name="category",
            ),
            taxonomy_config,
            {"en"},
            None,
        ),
    ],
)
def test_process_taxonomy_field(data, field, taxonomy_config, taxonomy_langs, expected):
    """Check the output of `process_taxonomy_field` for each parametrized case.

    :param data: the input document passed to `process_taxonomy_field`
    :param field: the taxonomy field config under test
    :param taxonomy_config: the taxonomy config passed to the function
    :param taxonomy_langs: the set of languages passed as `taxonomy_langs`
    :param expected: the expected output, or None if the function is expected
        to return None
    """
    output = process_taxonomy_field(
        data=data,
        field=field,
        taxonomy_config=taxonomy_config,
        split_separator=",",
        taxonomy_langs=taxonomy_langs,
    )

    if expected is None:
        assert output is None
        return

    # Fail with a clear message instead of an AttributeError on `.keys()`
    assert output is not None
    assert set(output.keys()) == set(expected.keys())

    for key, expected_value in expected.items():
        if isinstance(expected_value, list):
            # Multi-valued sub-fields: the order of the values is not part of
            # the contract checked here, so compare them as sets.
            assert set(output[key]) == set(expected_value)
        else:
            # Scalar sub-fields (such as `original`) must match exactly.
            # Wrapping a string in `set()` would only compare its characters
            # and let a reordered or otherwise altered string pass.
            assert output[key] == expected_value

0 comments on commit 7481115

Please sign in to comment.