diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index a316757dba255..d622430a390e1 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -1016,13 +1016,27 @@ sub add_specific_ingredients_from_labels($) { } -=head2 parse_specific_ingredients_from_text ( product_ref, $text ) +=head2 parse_specific_ingredients_from_text ( product_ref, $text, $percent_regexp ) Lists of ingredients sometime include extra mentions for specific ingredients at the end of the ingredients list. e.g. "Prepared with 50g of fruits for 100g of finished product". This function extracts those mentions and adds them to the specific_ingredients structure. +This function is also used to parse the origins of ingredients field. + +=head3 Arguments + +=head4 product_ref + +=head4 text $text + +=head4 percent regular expression $percent_regexp + +Used to find % values, language specific. + +Pass undef in order to skip % recognition. This is useful if we know the text is only for the origins of ingredients. + =head3 Return values =head4 specific_ingredients structure @@ -1062,7 +1076,8 @@ sub parse_specific_ingredients_from_text($$$) { # examples: # Total Milk Content 73%. - if ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) { + if ((defined $percent_regexp) + and ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) { $percent = $2; # $percent_regexp $ingredient = $1; $matched_text = $&; @@ -1071,7 +1086,7 @@ sub parse_specific_ingredients_from_text($$$) { } # Origin of the milk: United Kingdom - elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { + elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { # Note: the regexp above does not currently match multiple origins with commas (e.g. 
"Origins of milk: UK, UE") # in order to not overmatch something like "Origin of milk: UK, some other mention." # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. @@ -1081,7 +1096,6 @@ sub parse_specific_ingredients_from_text($$$) { # Remove the matched text $text = $` . ' ' . $'; } - } elsif ($product_lc eq "fr") { @@ -1090,7 +1104,8 @@ sub parse_specific_ingredients_from_text($$$) { # Teneur en lactose < 0,01 g/100 g. # Préparée avec 50 g de fruits pour 100 g de produit fini. - if ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) { + if ((defined $percent_regexp) + and ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) { $percent = $1; # $percent_regexp $ingredient = $2; $matched_text = $&; @@ -1100,7 +1115,8 @@ sub parse_specific_ingredients_from_text($$$) { # Teneur totale en sucres : 60 g pour 100 g de produit fini. 
# Teneur en citron de 100% - elsif ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i) { + elsif ((defined $percent_regexp) + and ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i)) { $percent = $2; # $percent_regexp $ingredient = $1; $matched_text = $&; @@ -1109,7 +1125,7 @@ sub parse_specific_ingredients_from_text($$$) { } # Origine du Cacao: Pérou - elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { + elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { # Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") # in order to not overmatch something like "Origin of milk: UK, some other mention." # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. @@ -1118,6 +1134,8 @@ sub parse_specific_ingredients_from_text($$$) { $matched_text = $&; # Remove the matched text $text = $` . ' ' . $'; + # Remove extra spaces + $ingredient =~ s/\s+$//; } } @@ -1145,6 +1163,136 @@ sub parse_specific_ingredients_from_text($$$) { } +=head2 parse_origins_from_text ( product_ref, $text) + +This function parses the origins of ingredients field to extract the origins of specific ingredients. +The origins are stored in the specific_ingredients structure of the product. + +Note: this function is similar to parse_specific_ingredients_from_text() that operates on ingredients lists. 
+The difference is that parse_specific_ingredients_from_text() only extracts and recognizes text that is +an extra mention at the end of an ingredient list (e.g. "Origin of strawberries: Spain"), +while parse_origins_from_text() will also recognize text like "Strawberries: Spain". + +=head3 Arguments + +=head4 product_ref + +=head4 text $text + +=head3 Return values + +=head4 specific_ingredients structure + +Array of specific ingredients. + +=head4 + +=cut + +sub parse_origins_from_text($$) { + + my $product_ref = shift; + my $text = shift; + + my $product_lc = $product_ref->{lc}; + + # Go through the ingredient lists multiple times + # as long as we have one match + my $ingredient = "start"; + + while ($ingredient) { + + # Initialize values + $ingredient = undef; + my $matched_text; + my $origins; + + # Note: in regular expressions below, use non-capturing groups (starting with (?: ) + # for all groups, except groups that capture actual data: ingredient name, percent, origins + + # Regexps should match until we reach a . ; or the end of the text + + if ($product_lc eq "en") { + + # Origin of the milk: United Kingdom. + if ($text =~ /\s*(?:origin of (?:the )?)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { + # Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") + # in order to not overmatch something like "Origin of milk: UK, some other mention." + # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. + $origins = $2; + $ingredient = $1; + $matched_text = $&; + # Remove the matched text + $text = $` . ' ' . $'; + } + # Strawberries: Spain + elsif ($text =~ /\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i) { + # Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") + # in order to not overmatch something like "Origin of milk: UK, some other mention." 
+ # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. + $origins = $2; + $ingredient = $1; + $matched_text = $&; + # Remove the matched text + $text = $` . ' ' . $'; + } + } + elsif ($product_lc eq "fr") { + + # Origine du Cacao: Pérou + if ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { + # Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") + # in order to not overmatch something like "Origin of milk: UK, some other mention." + # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. + $origins = $2; + $ingredient = $1; + $matched_text = $&; + # Remove the matched text + $text = $` . ' ' . $'; + # Remove extra spaces + $ingredient =~ s/\s+$//; + } + # Cacao: Pérou + elsif ($text =~ /\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i) { + # Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") + # in order to not overmatch something like "Origin of milk: UK, some other mention." + # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. + $origins = $2; + $ingredient = $1; + $matched_text = $&; + # Remove the matched text + $text = $` . ' ' . 
$'; + # Remove extra spaces + $ingredient =~ s/\s+$//; + } + # TODO: + # Fraises de Bretagne + # Filet de dinde de Vendée + + } + + # If we found an ingredient, save it in specific_ingredients + if (defined $ingredient) { + my $ingredient_id = canonicalize_taxonomy_tag($product_lc, "ingredients", $ingredient); + + $matched_text =~ s/^\s+//; + + my $specific_ingredients_ref = { + id => $ingredient_id, + ingredient => $ingredient, + text => $matched_text, + }; + + defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origins )); + + push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref; + } + } + + return $text; +} + + =head2 parse_ingredients_text ( product_ref ) Parse the ingredients_text field to extract individual ingredients. @@ -2165,15 +2313,28 @@ sub extract_ingredients_from_text($) { delete $product_ref->{ingredients_percent_analysis}; - # Parse the ingredients list to extract individual ingredients and sub-ingredients - # to create the ingredients array with nested sub-ingredients arrays + # The specific ingredients array will contain indications regarding the percentage, + # origins, labels etc. of specific ingredients. That information may come from: + # - the origin of ingredients field ("origin") + # - labels (e.g. "British eggs") + # - the end of the list of the ingredients. e.g. "Origin of the rice: Thailand" $product_ref->{specific_ingredients} = []; - parse_ingredients_text($product_ref); + # Ingredients origins may be listed in the origin field + # e.g. "Origin of the rice: Thailand." + my $product_lc = $product_ref->{lc}; + if (defined $product_ref->{"origin_" . $product_lc}) { + parse_origins_from_text($product_ref, $product_ref->{"origin_" . 
$product_lc}); + } # Add specific ingredients from labels - add_specific_ingredients_from_labels($product_ref); + add_specific_ingredients_from_labels($product_ref); + + # Parse the ingredients list to extract individual ingredients and sub-ingredients + # to create the ingredients array with nested sub-ingredients arrays + + parse_ingredients_text($product_ref); if (defined $product_ref->{ingredients}) { diff --git a/t/expected_test_results/ingredients/en-origin-field.json b/t/expected_test_results/ingredients/en-origin-field.json new file mode 100644 index 0000000000000..c5166446af57b --- /dev/null +++ b/t/expected_test_results/ingredients/en-origin-field.json @@ -0,0 +1,146 @@ +{ + "ingredients" : [ + { + "id" : "en:strawberry", + "origins" : "en:spain", + "percent_estimate" : 58.3333333333333, + "percent_max" : 100, + "percent_min" : 16.6666666666667, + "text" : "Strawberries", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:raspberry", + "origins" : "en:new-caledonia", + "percent_estimate" : 20.8333333333333, + "percent_max" : 50, + "percent_min" : 0, + "text" : "raspberries", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:blueberry", + "origins" : "en:canada", + "percent_estimate" : 10.4166666666667, + "percent_max" : 33.3333333333333, + "percent_min" : 0, + "text" : "blueberries", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:gooseberry", + "percent_estimate" : 5.20833333333333, + "percent_max" : 25, + "percent_min" : 0, + "text" : "gooseberries", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:white-peach", + "origins" : "en:mexico", + "percent_estimate" : 2.60416666666666, + "percent_max" : 20, + "percent_min" : 0, + "text" : "white peaches", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:bell-pepper", + "origins" : "en:guatemala", + "percent_estimate" : 2.60416666666666, + "percent_max" : 16.6666666666667, + "percent_min" : 0, + "text" : "bell peppers", + "vegan" : "yes", 
+ "vegetarian" : "yes" + } + ], + "ingredients_analysis" : {}, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:vegan", + "en:vegetarian" + ], + "ingredients_hierarchy" : [ + "en:strawberry", + "en:fruit", + "en:berries", + "en:raspberry", + "en:blueberry", + "en:gooseberry", + "en:white-peach", + "en:peach", + "en:bell-pepper", + "en:vegetable" + ], + "ingredients_n" : 6, + "ingredients_n_tags" : [ + "6", + "1-10" + ], + "ingredients_original_tags" : [ + "en:strawberry", + "en:raspberry", + "en:blueberry", + "en:gooseberry", + "en:white-peach", + "en:bell-pepper" + ], + "ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "en:strawberry", + "en:fruit", + "en:berries", + "en:raspberry", + "en:blueberry", + "en:gooseberry", + "en:white-peach", + "en:peach", + "en:bell-pepper", + "en:vegetable" + ], + "ingredients_text" : "Strawberries (Spain), raspberries, blueberries, gooseberries, white peaches, bell peppers. Origin of bell peppers: Guatemala", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 6, + "ingredients_with_unspecified_percent_sum" : 100, + "known_ingredients_n" : 10, + "lc" : "en", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 16.6666666666667, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 16.6666666666667 + }, + "origin_en" : "Origin of raspberries: New Caledonia. Blueberries: Canada ; White peaches : Mexico", + "specific_ingredients" : [ + { + "id" : "en:raspberry", + "ingredient" : "raspberries", + "origins" : "en:new-caledonia", + "text" : "Origin of raspberries: New Caledonia." 
+ }, + { + "id" : "en:blueberry", + "ingredient" : "Blueberries", + "origins" : "en:canada", + "text" : "Blueberries: Canada ;" + }, + { + "id" : "en:white-peach", + "ingredient" : "White peaches ", + "origins" : "en:mexico", + "text" : "White peaches : Mexico" + }, + { + "id" : "en:bell-pepper", + "ingredient" : "bell peppers", + "origins" : "en:guatemala", + "text" : "Origin of bell peppers: Guatemala" + } + ], + "unknown_ingredients_n" : 0 +} diff --git a/t/expected_test_results/ingredients/fr-origin-field.json b/t/expected_test_results/ingredients/fr-origin-field.json new file mode 100644 index 0000000000000..ad441b39e505e --- /dev/null +++ b/t/expected_test_results/ingredients/fr-origin-field.json @@ -0,0 +1,245 @@ +{ + "ingredients" : [ + { + "id" : "fr:Coquillettes", + "percent_estimate" : 54.5454545454545, + "percent_max" : 100, + "percent_min" : 9.09090909090909, + "text" : "Coquillettes" + }, + { + "id" : "en:comte", + "percent_estimate" : 22.7272727272727, + "percent_max" : 50, + "percent_min" : 0, + "text" : "comté", + "vegan" : "no", + "vegetarian" : "maybe" + }, + { + "id" : "fr:jambon supérieur", + "origins" : "en:france", + "percent_estimate" : 11.3636363636364, + "percent_max" : 33.3333333333333, + "percent_min" : 0, + "text" : "jambon supérieur" + }, + { + "id" : "en:white-wine", + "origins" : "en:europe", + "percent_estimate" : 5.68181818181818, + "percent_max" : 25, + "percent_min" : 0, + "text" : "vin blanc", + "vegan" : "maybe", + "vegetarian" : "yes" + }, + { + "id" : "en:red-wine", + "origins" : "en:italy", + "percent_estimate" : 2.84090909090909, + "percent_max" : 20, + "percent_min" : 0, + "text" : "vin rouge", + "vegan" : "maybe", + "vegetarian" : "yes" + }, + { + "id" : "en:rose-wine", + "origins" : "en:spain", + "percent_estimate" : 1.42045454545455, + "percent_max" : 16.6666666666667, + "percent_min" : 0, + "text" : "vin rosé", + "vegan" : "maybe", + "vegetarian" : "yes" + }, + { + "id" : "en:uht-cream", + "percent_estimate" : 
0.710227272727273, + "percent_max" : 14.2857142857143, + "percent_min" : 0, + "text" : "crème UHT", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "en:parmesan", + "origins" : "en:italy", + "percent_estimate" : 0.35511363636364, + "percent_max" : 12.5, + "percent_min" : 0, + "text" : "parmesan", + "vegan" : "no", + "vegetarian" : "maybe" + }, + { + "id" : "en:ricotta", + "origins" : "en:italy", + "percent_estimate" : 0.17755681818182, + "percent_max" : 11.1111111111111, + "percent_min" : 0, + "text" : "ricotta", + "vegan" : "no", + "vegetarian" : "maybe" + }, + { + "id" : "en:salt", + "percent_estimate" : 0.0887784090909065, + "percent_max" : 10, + "percent_min" : 0, + "text" : "sel", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:pepper", + "origins" : "en:nepal", + "percent_estimate" : 0.0887784090909065, + "percent_max" : 9.09090909090909, + "percent_min" : 0, + "text" : "poivre", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], + "ingredients_analysis" : { + "en:non-vegan" : [ + "en:comte", + "en:uht-cream", + "en:parmesan", + "en:ricotta" + ], + "en:palm-oil-content-unknown" : [ + "fr:Coquillettes", + "fr:jambon supérieur" + ], + "en:vegan-status-unknown" : [ + "fr:Coquillettes", + "fr:jambon supérieur" + ], + "en:vegetarian-status-unknown" : [ + "fr:Coquillettes", + "fr:jambon supérieur" + ] + }, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:non-vegan", + "en:vegetarian-status-unknown" + ], + "ingredients_hierarchy" : [ + "fr:Coquillettes", + "en:comte", + "en:dairy", + "en:cheese", + "fr:jambon supérieur", + "en:white-wine", + "en:alcohol", + "en:wine", + "en:red-wine", + "en:rose-wine", + "en:uht-cream", + "en:cream", + "en:parmesan", + "en:ricotta", + "en:salt", + "en:pepper", + "en:seed" + ], + "ingredients_n" : 11, + "ingredients_n_tags" : [ + "11", + "11-20" + ], + "ingredients_original_tags" : [ + "fr:Coquillettes", + "en:comte", + "fr:jambon supérieur", + "en:white-wine", + "en:red-wine", + 
"en:rose-wine", + "en:uht-cream", + "en:parmesan", + "en:ricotta", + "en:salt", + "en:pepper" + ], + "ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "fr:coquillettes", + "en:comte", + "en:dairy", + "en:cheese", + "fr:jambon-superieur", + "en:white-wine", + "en:alcohol", + "en:wine", + "en:red-wine", + "en:rose-wine", + "en:uht-cream", + "en:cream", + "en:parmesan", + "en:ricotta", + "en:salt", + "en:pepper", + "en:seed" + ], + "ingredients_text" : "Coquillettes, comté, jambon supérieur, vin blanc, vin rouge (italie), vin rosé (origine : Espagne), crème UHT, parmesan, ricotta (origine Italie), sel, poivre. Origine du poivre: Népal.", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 11, + "ingredients_with_unspecified_percent_sum" : 100, + "known_ingredients_n" : 15, + "lc" : "fr", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0 + }, + "origin_fr" : "Origine des coquillettes : Italie. Origine du Comté AOP 4 mois : France. Origine du jambon supérieur : France. Vin blanc : Europe. Origine Crème UHT : France. Origine du parmesan : Italie. Fabriqué en France. Tomates d'Italie. Origine du riz : Inde, Thaïlande.", + "specific_ingredients" : [ + { + "id" : "fr:coquillettes", + "ingredient" : "coquillettes", + "origins" : "en:italy", + "text" : "Origine des coquillettes : Italie." + }, + { + "id" : "fr:Comté AOP 4 mois", + "ingredient" : "Comté AOP 4 mois", + "origins" : "en:france", + "text" : "Origine du Comté AOP 4 mois : France." + }, + { + "id" : "fr:jambon supérieur", + "ingredient" : "jambon supérieur", + "origins" : "en:france", + "text" : "Origine du jambon supérieur : France." + }, + { + "id" : "en:parmesan", + "ingredient" : "parmesan", + "origins" : "en:italy", + "text" : "Origine du parmesan : Italie." 
+ }, + { + "id" : "en:white-wine", + "ingredient" : "Vin blanc", + "origins" : "en:europe", + "text" : "Vin blanc : Europe." + }, + { + "id" : "fr:Origine Crème UHT", + "ingredient" : "Origine Crème UHT", + "origins" : "en:france", + "text" : "Origine Crème UHT : France." + }, + { + "id" : "en:pepper", + "ingredient" : "poivre", + "origins" : "en:nepal", + "text" : "Origine du poivre: Népal" + } + ], + "unknown_ingredients_n" : 2 +} diff --git a/t/ingredients.t b/t/ingredients.t index 1ea8bc4b99921..2c49956acd2ab 100755 --- a/t/ingredients.t +++ b/t/ingredients.t @@ -422,8 +422,28 @@ Teneur en citron de 5,5%", lc => "en", ingredients_text => "milk, some unknown ingredient, another unknown ingredient, salt, sugar, pepper, spices, water", } + ], + + # origins field + # also test an ingredient with 2 words: bell peppers, which used to break. + [ + "en-origin-field", + { + lc => "en", + ingredients_text => "Strawberries (Spain), raspberries, blueberries, gooseberries, white peaches, bell peppers. Origin of bell peppers: Guatemala", + origin_en => "Origin of raspberries: New Caledonia. Blueberries: Canada ; White peaches : Mexico", + } ], + # origins field + [ + "fr-origin-field", + { + lc => "fr", + ingredients_text => "Coquillettes, comté, jambon supérieur, vin blanc, vin rouge (italie), vin rosé (origine : Espagne), crème UHT, parmesan, ricotta (origine Italie), sel, poivre. Origine du poivre: Népal.", + origin_fr => "Origine des coquillettes : Italie. Origine du Comté AOP 4 mois : France. Origine du jambon supérieur : France. Vin blanc : Europe. Origine Crème UHT : France. Origine du parmesan : Italie. Fabriqué en France. Tomates d'Italie. Origine du riz : Inde, Thaïlande.", + } + ], );