feat: add support for synonyms for taxonomy field types

Also add better documentation and unit tests
openfoodfacts · Nov 27, 2023 · 7481115 · 7481115
1 parent 4df7893
commit 7481115
Show file tree

Hide file tree

Showing 4 changed files with 264 additions and 17 deletions.
diff --git a/app/config.py b/app/config.py
@@ -134,8 +134,9 @@ def is_numeric(self):
 
 
 class FieldConfig(BaseModel):
-    # name of the field (internal field), it's added here for convenience
-    _name: str = ""
+    # name of the field (internal field), it's added here for convenience.
+    # It's set by the `add_field_name_to_each_field` classmethod.
+    name: Annotated[str, Field(description="name of the field, must be unique")] = ""
     type: Annotated[
         FieldType,
         Field(description="type of the field, see `FieldType` for possible values"),
@@ -169,13 +170,19 @@ class FieldConfig(BaseModel):
         ),
     ] = False
     taxonomy_name: Annotated[
-        str | None, Field(description="only for taxonomy field type")
+        str | None,
+        Field(
+            description="the name of the taxonomy associated with this field. "
+            "It must only be provided for taxonomy field type."
+        ),
     ] = None
-
-    @property
-    def name(self) -> str:
-        """Get field name."""
-        return self._name
+    add_taxonomy_synonyms: Annotated[
+        bool,
+        Field(
+            description="if True, add all synonyms of the taxonomy values to the index. "
+            "The flag is ignored if the field type is not `taxonomy`."
+        ),
+    ] = True
 
     @model_validator(mode="after")
     def taxonomy_name_should_be_used_for_taxonomy_type_only(self):
@@ -358,9 +365,9 @@ def field_references_must_exist_and_be_valid(self):
 
     @field_validator("fields")
     @classmethod
-    def add_field_name_to_each_field(cls, fields):
+    def add_field_name_to_each_field(cls, fields: dict[str, FieldConfig]):
         for field_name, field_item in fields.items():
-            field_item._name = field_name
+            field_item.name = field_name
         return fields
 
     def get_supported_langs(self) -> set[str]:

diff --git a/app/indexing.py b/app/indexing.py
@@ -127,6 +127,23 @@ def process_text_lang_field(
     split_separator: str,
     supported_langs: set[str],
 ) -> JSONType | None:
+    """Process data for a `text_lang` field type.
+
+    Generates a dict ready to be indexed by Elasticsearch, with a subfield for
+    each language.
+
+    :param data: input data, as a dict
+    :param input_field: the name of the field to use as input
+    :param split: whether to split the input field value, using
+        `split_separator` as separator
+    :param lang_separator: the separator used to separate the language code
+        from the field name
+    :param split_separator: the separator used to split the input field value,
+        in case of multi-valued input (if `split` is True)
+    :param supported_langs: a set of supported languages (2-letter codes), used
+        to know which sub-fields to create
+    :return: the processed data, as a dict
+    """
     field_input: JSONType = {}
     target_fields = [
         k
@@ -170,6 +187,26 @@ def process_taxonomy_field(
     split_separator: str,
     taxonomy_langs: set[str],
 ) -> JSONType | None:
+    """Process data for a `taxonomy` field type.
+
+    Generates a dict ready to be indexed by Elasticsearch, with a subfield for
+    each language. Two other subfields are added:
+    - `original`: the original value of the field. For example, if the field
+        name is `categories` and `categories` already exists in the document,
+        we will save its value in the `original` subfield. This subfield is
+        only added if the field is present in the input data.
+    - `other`: the value of the field for languages that are not supported by
+        the project (no Elasticsearch-specific analyzers)
+
+    :param data: input data, as a dict
+    :param field: the field config
+    :param taxonomy_config: the taxonomy config
+    :param split_separator: the separator used to split the input field value,
+        in case of multi-valued input (if `field.split` is True)
+    :param taxonomy_langs: a set of supported languages (2-letter codes), used
+        to know which sub-fields to create.
+    :return: the processed data, as a dict
+    """
     field_input: JSONType = {}
     input_field = field.get_input_field()
     input_value = preprocess_field_value(
@@ -198,11 +235,25 @@ def process_taxonomy_field(
     langs = taxonomy_langs | set(data.get("taxonomy_langs", []))
     for lang in langs:
         for single_tag in input_value:
-            if (value := taxonomy.get_localized_name(single_tag, lang)) is not None:
-                # If language is not supported (=no elasticsearch specific
-                # analyzers), we store the data in a "other" field
-                key = lang if lang in ANALYZER_LANG_MAPPING else "other"
-                field_input.setdefault(key, []).append(value)
+            if single_tag not in taxonomy:
+                continue
+
+            node = taxonomy[single_tag]
+            values = {node.get_localized_name(lang)}
+
+            if field.add_taxonomy_synonyms:
+                values |= set(node.get_synonyms(lang))
+
+                # Add the synonyms of the international ("xx") name as well
+                if "xx" in node.names:
+                    values |= set(node.get_synonyms("xx"))
+
+            for value in values:
+                if value is not None:
+                    # If language is not supported (=no elasticsearch specific
+                    # analyzers), we store the data in an "other" field
+                    key = lang if lang in ANALYZER_LANG_MAPPING else "other"
+                    field_input.setdefault(key, []).append(value)
 
     if field.name in data:
         field_input["original"] = data[field.name]

diff --git a/tests/unit/data/openfoodfacts_config.yml b/tests/unit/data/openfoodfacts_config.yml
@@ -134,13 +134,68 @@ taxonomy:
     url: https://static.openfoodfacts.org/data/taxonomies/categories.full.json
   - name: label
     url: https://static.openfoodfacts.org/data/taxonomies/labels.full.json
+  - name: additive
+    url: https://static.openfoodfacts.org/data/taxonomies/additives.full.json
+  - name: allergen
+    url: https://static.openfoodfacts.org/data/taxonomies/allergens.full.json
+  - name: amino_acid
+    url: https://static.openfoodfacts.org/data/taxonomies/amino_acids.full.json
+  - name: country
+    url: https://static.openfoodfacts.org/data/taxonomies/countries.full.json
+  - name: data_quality
+    url: https://static.openfoodfacts.org/data/taxonomies/data_quality.full.json
+  - name: food_group
+    url: https://static.openfoodfacts.org/data/taxonomies/food_groups.full.json
+  - name: improvement
+    url: https://static.openfoodfacts.org/data/taxonomies/improvements.full.json
+  - name: ingredient
+    url: https://static.openfoodfacts.org/data/taxonomies/ingredients.full.json
+  - name: ingredients_analysis
+    url: https://static.openfoodfacts.org/data/taxonomies/ingredients_analysis.full.json
+  - name: ingredients_processing
+    url: https://static.openfoodfacts.org/data/taxonomies/ingredients_processing.full.json
+  - name: language
+    url: https://static.openfoodfacts.org/data/taxonomies/languages.full.json
+  - name: mineral
+    url: https://static.openfoodfacts.org/data/taxonomies/minerals.full.json
+  - name: misc
+    url: https://static.openfoodfacts.org/data/taxonomies/misc.full.json
+  - name: nova_group
+    url: https://static.openfoodfacts.org/data/taxonomies/nova_groups.full.json
+  - name: nucleotide
+    url: https://static.openfoodfacts.org/data/taxonomies/nucleotides.full.json
+  - name: nutrient
+    url: https://static.openfoodfacts.org/data/taxonomies/nutrients.full.json
+  - name: origin
+    url: https://static.openfoodfacts.org/data/taxonomies/origins.full.json
+  - name: other_nutritional_substance
+    url: https://static.openfoodfacts.org/data/taxonomies/other_nutritional_substances.full.json
+  - name: packaging_material
+    url: https://static.openfoodfacts.org/data/taxonomies/packaging_materials.full.json
+  - name: packaging_recycling
+    url: https://static.openfoodfacts.org/data/taxonomies/packaging_recycling.full.json
+  - name: packaging_shape
+    url: https://static.openfoodfacts.org/data/taxonomies/packaging_shapes.full.json
+  - name: periods_after_opening
+    url: https://static.openfoodfacts.org/data/taxonomies/periods_after_opening.full.json
+  - name: preservation
+    url: https://static.openfoodfacts.org/data/taxonomies/preservation.full.json
+  - name: state
+    url: https://static.openfoodfacts.org/data/taxonomies/states.full.json
+  - name: vitamin
+    url: https://static.openfoodfacts.org/data/taxonomies/vitamins.full.json
+  - name: brand
+    url: https://static.openfoodfacts.org/data/taxonomies/brands.full.json
   exported_langs:
   - en
   - fr
   - es
   - de
   - it
   - nl
+  index:
+    number_of_replicas: 1
+    number_of_shards: 4
 supported_langs:
 - aa
 - ab

diff --git a/tests/unit/test_indexing.py b/tests/unit/test_indexing.py
@@ -1,6 +1,13 @@
 import pytest
 
-from app.indexing import process_text_lang_field
+from app.config import (
+    FieldConfig,
+    FieldType,
+    TaxonomyConfig,
+    TaxonomyIndexConfig,
+    TaxonomySourceConfig,
+)
+from app.indexing import process_taxonomy_field, process_text_lang_field
 
 
 @pytest.mark.parametrize(
@@ -26,7 +33,16 @@
                 "main": "MAIN",
                 "other": ["VN", "ID"],
             },
-        )
+        ),
+        # Same, but without main language
+        (
+            {
+                "product_name_fr": "FR",
+            },
+            "product_name",
+            False,
+            {"fr": "FR"},
+        ),
     ],
 )
 def test_process_text_lang_field(data, input_field, split, expected):
@@ -44,3 +60,121 @@ def test_process_text_lang_field(data, input_field, split, expected):
         )
         == expected
     )
+
+
+taxonomy_config = TaxonomyConfig(
+    sources=[
+        TaxonomySourceConfig(
+            name="category",
+            url="https://static.openfoodfacts.org/data/taxonomies/categories.full.json",
+        )
+    ],
+    exported_langs=["en"],
+    index=TaxonomyIndexConfig(),
+)
+
+
+@pytest.mark.parametrize(
+    "data, field, taxonomy_config, taxonomy_langs, expected",
+    [
+        (
+            {
+                "taxonomy_langs": ["fr", "it"],
+                # en:edamame has an "xx" (international) name in the taxonomy
+                "categories_tags": "en:beverages,en:alcoholic-beverages,en:not-in-taxonomy,en:edamame",
+                # the original name should be saved under an `original` key
+                "categories": "Boissons,Boissons alcoolisées,Edamame",
+            },
+            FieldConfig(
+                type=FieldType.taxonomy,
+                name="categories",
+                input_field="categories_tags",
+                split=True,
+                add_taxonomy_synonyms=True,
+                taxonomy_name="category",
+            ),
+            taxonomy_config,
+            {"en"},
+            {
+                "fr": [
+                    "Boissons",
+                    "alcool",
+                    "alcools",
+                    "boisson alcoolisée",
+                    "Boissons alcoolisées",
+                    "Edamame",
+                ],
+                "it": ["Bevande", "Bevande alcoliche", "Edamame"],
+                "en": [
+                    "Drinks",
+                    "Beverages",
+                    "Alcoholic beverages",
+                    "drinks with alcohol",
+                    "alcohols",
+                    "Alcoholic drinks",
+                    "Edamame",
+                ],
+                "original": "Boissons,Boissons alcoolisées,Edamame",
+            },
+        ),
+        # Same, but without synonyms
+        (
+            {
+                "taxonomy_langs": ["fr", "it"],
+                "categories_tags": "en:beverages,en:alcoholic-beverages",
+            },
+            FieldConfig(
+                type=FieldType.taxonomy,
+                name="categories",
+                input_field="categories_tags",
+                split=True,
+                add_taxonomy_synonyms=False,
+                taxonomy_name="category",
+            ),
+            taxonomy_config,
+            {"en"},
+            {
+                "fr": [
+                    "Boissons",
+                    "Boissons alcoolisées",
+                ],
+                "it": ["Bevande", "Bevande alcoliche"],
+                "en": [
+                    "Beverages",
+                    "Alcoholic beverages",
+                ],
+            },
+        ),
+        # The field is missing here, we should return None
+        (
+            {"taxonomy_langs": ["fr", "it"]},
+            FieldConfig(
+                type=FieldType.taxonomy,
+                name="categories",
+                input_field="categories_tags",
+                split=True,
+                add_taxonomy_synonyms=False,
+                taxonomy_name="category",
+            ),
+            taxonomy_config,
+            {"en"},
+            None,
+        ),
+    ],
+)
+def test_process_taxonomy_field(data, field, taxonomy_config, taxonomy_langs, expected):
+    split_separator = ","
+    output = process_taxonomy_field(
+        data=data,
+        field=field,
+        taxonomy_config=taxonomy_config,
+        split_separator=split_separator,
+        taxonomy_langs=taxonomy_langs,
+    )
+
+    if expected is None:
+        assert output is None
+    else:
+        assert set(output.keys()) == set(expected.keys())
+        for key in expected.keys():
+            assert set(output[key]) == set(expected[key])