Skip to content

Commit

Permalink
fix: letter A at end of string is not a stopword (#11095)
Browse files Browse the repository at this point in the history
I think this may be the reason for
#11014

E120 and E124 have the synonyms "Cochineal Red" and "Cochineal Red A" respectively.
In the ingredients taxonomy, "a" is a stopword, so "Cochineal Red A" is reduced to "Cochineal Red".
As a result, E124 becomes a synonym of E120.

Solution: we don't consider A / a to be a stopword when it is at the end
of a string, because in English the article "a" always comes before a noun.
  • Loading branch information
stephanegigandet authored Dec 6, 2024
1 parent 96b2818 commit 6eaeb26
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 1 deletion.
9 changes: 8 additions & 1 deletion lib/ProductOpener/Tags.pm
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,13 @@ sub remove_stopwords ($tagtype, $lc, $tagid) {

my $uppercased_stopwords_overrides = 0;

if ($lc eq 'en') {
# in English, "a" is a stopword for ingredients, but we do not want to remove it at the end of a tag
# e.g. "Cochineal Red A" -> "cochineal-red-a" --> "a" should not be a stopword
$tagid =~ s/a$/A/;
$uppercased_stopwords_overrides = 1;
}

if ($lc eq 'fr') {
# "Dés de tomates" -> "des-de-tomates" --> "dés" should not be a stopword
$tagid =~ s/\bdes-de\b/DES-DE/g;
Expand Down Expand Up @@ -1099,7 +1106,7 @@ sub get_file_from_cache ($source, $target) {
# e.g. if the taxonomy building algorithm or configuration has changed
# This needs to be done also when the unaccenting parameters for languages set in Config.pm are changed

my $BUILD_TAGS_VERSION = "20240828 - new [tagtype].extended.json format with normalized extended synonyms";
my $BUILD_TAGS_VERSION = "20241206 - the letter A at the end of an entry should not be a stopword in English";

sub get_from_cache ($tagtype, @files) {
# If the full set of cached files can't be found then returns the hash to be used
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
{
"ingredients" : [
{
"id" : "en:e124",
"is_in_taxonomy" : 1,
"percent_estimate" : 60,
"percent_max" : 100,
"percent_min" : 20,
"text" : "e124",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "en:e124",
"is_in_taxonomy" : 1,
"percent_estimate" : 20,
"percent_max" : 50,
"percent_min" : 0,
"text" : "Ponceau 4R",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "en:e124",
"is_in_taxonomy" : 1,
"percent_estimate" : 10,
"percent_max" : 33.3333333333333,
"percent_min" : 0,
"text" : "Cochineal Red A",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "en:e120",
"is_in_taxonomy" : 1,
"percent_estimate" : 5,
"percent_max" : 25,
"percent_min" : 0,
"text" : "Cochineal Red",
"vegan" : "no",
"vegetarian" : "no"
},
{
"ecobalyse_code" : "pear-eu",
"id" : "en:pear",
"is_in_taxonomy" : 1,
"percent_estimate" : 5,
"percent_max" : 20,
"percent_min" : 0,
"text" : "a pear",
"vegan" : "yes",
"vegetarian" : "yes"
}
],
"ingredients_analysis" : {
"en:non-vegan" : [
"en:e120"
],
"en:non-vegetarian" : [
"en:e120"
]
},
"ingredients_analysis_tags" : [
"en:palm-oil-free",
"en:non-vegan",
"en:non-vegetarian"
],
"ingredients_hierarchy" : [
"en:e124",
"en:e120",
"en:pear",
"en:fruit"
],
"ingredients_lc" : "en",
"ingredients_n" : 5,
"ingredients_n_tags" : [
"5",
"1-10"
],
"ingredients_original_tags" : [
"en:e124",
"en:e124",
"en:e124",
"en:e120",
"en:pear"
],
"ingredients_percent_analysis" : 1,
"ingredients_tags" : [
"en:e124",
"en:e120",
"en:pear",
"en:fruit"
],
"ingredients_text" : "E124, Ponceau 4R, Cochineal Red A, Cochineal Red, a pear",
"ingredients_with_specified_percent_n" : 0,
"ingredients_with_specified_percent_sum" : 0,
"ingredients_with_unspecified_percent_n" : 5,
"ingredients_with_unspecified_percent_sum" : 100,
"ingredients_without_ciqual_codes" : [
"en:e120",
"en:e124",
"en:pear"
],
"ingredients_without_ciqual_codes_n" : 3,
"ingredients_without_ecobalyse_ids" : [
"en:e120",
"en:e124"
],
"ingredients_without_ecobalyse_ids_n" : 2,
"known_ingredients_n" : 4,
"lc" : "en",
"nutriments" : {
"fruits-vegetables-legumes-estimate-from-ingredients_100g" : 5,
"fruits-vegetables-legumes-estimate-from-ingredients_serving" : 5,
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : 5,
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : 5
},
"unknown_ingredients_n" : 0
}
11 changes: 11 additions & 0 deletions tests/unit/ingredients.t
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,17 @@ puffed orange and caramelized unknown_fruit4.",
ingredients_text => "some unknown ingredient and salt",
}
],

# Do not consider A at the end of the string to be a stopword
# https://github.com/openfoodfacts/openfoodfacts-server/pull/11095
[
"en-ingredient-ending-with-a",
{
lc => "en",
ingredients_text => "E124, Ponceau 4R, Cochineal Red A, Cochineal Red, a pear",
}
],

);

foreach my $test_ref (@tests) {
Expand Down

0 comments on commit 6eaeb26

Please sign in to comment.