-
-
Notifications
You must be signed in to change notification settings - Fork 403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: parse origins of ingredients field #6995
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1016,13 +1016,27 @@ sub add_specific_ingredients_from_labels($) { | |
} | ||
|
||
|
||
=head2 parse_specific_ingredients_from_text ( product_ref, $text ) | ||
=head2 parse_specific_ingredients_from_text ( product_ref, $text, $percent_regexp ) | ||
|
||
Lists of ingredients sometimes include extra mentions for specific ingredients | ||
at the end of the ingredients list. e.g. "Prepared with 50g of fruits for 100g of finished product". | ||
|
||
This function extracts those mentions and adds them to the specific_ingredients structure. | ||
|
||
This function is also used to parse the origins of ingredients field. | ||
|
||
=head3 Arguments | ||
|
||
=head4 product_ref | ||
|
||
=head4 text $text | ||
|
||
=head4 percent regular expression $percent_regexp | ||
|
||
Used to find % values, language specific. | ||
|
||
Pass undef in order to skip % recognition. This is useful if we know the text is only for the origins of ingredients. | ||
|
||
=head3 Return values | ||
|
||
=head4 specific_ingredients structure | ||
|
@@ -1062,7 +1076,8 @@ sub parse_specific_ingredients_from_text($$$) { | |
# examples: | ||
# Total Milk Content 73%. | ||
|
||
if ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) { | ||
if ((defined $percent_regexp) | ||
and ($text =~ /\s*(?:total |min |minimum )?([^,.;]+?)\s+content(?::| )+$percent_regexp\s*(?:per 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) { | ||
$percent = $2; # $percent_regexp | ||
$ingredient = $1; | ||
$matched_text = $&; | ||
|
@@ -1071,7 +1086,7 @@ sub parse_specific_ingredients_from_text($$$) { | |
} | ||
|
||
# Origin of the milk: United Kingdom | ||
elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { | ||
elsif ($text =~ /\s*(?:origin of (?:the )?)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { | ||
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") | ||
# in order to not overmatch something like "Origin of milk: UK, some other mention." | ||
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. | ||
|
@@ -1081,7 +1096,6 @@ sub parse_specific_ingredients_from_text($$$) { | |
# Remove the matched text | ||
$text = $` . ' ' . $'; | ||
} | ||
|
||
} | ||
elsif ($product_lc eq "fr") { | ||
|
||
|
@@ -1090,7 +1104,8 @@ sub parse_specific_ingredients_from_text($$$) { | |
# Teneur en lactose < 0,01 g/100 g. | ||
# Préparée avec 50 g de fruits pour 100 g de produit fini. | ||
|
||
if ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i) { | ||
if ((defined $percent_regexp) | ||
and ($text =~ /\s*(?:(?:préparé|prepare)(?:e|s|es)? avec)(?: au moins)?(?::| )+$percent_regexp (?:de |d')?([^,.;]+?)\s*(?:pour 100\s*(?:g)(?:[^,.;-]*?))?(?:;|\.| - |$)/i)) { | ||
$percent = $1; # $percent_regexp | ||
$ingredient = $2; | ||
$matched_text = $&; | ||
|
@@ -1100,7 +1115,8 @@ sub parse_specific_ingredients_from_text($$$) { | |
|
||
# Teneur totale en sucres : 60 g pour 100 g de produit fini. | ||
# Teneur en citron de 100% | ||
elsif ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i) { | ||
elsif ((defined $percent_regexp) | ||
and ($text =~ /\s*teneur(?: min| minimum| minimale| totale)?(?: en | de | d'| du )([^,.;]+?)\s*(?:pour 100\s*(?:g)(?: de produit(?: fini)?)?)?(?: de)?(?::| )+$percent_regexp\s*(?:pour 100\s*(?:g)(?:[^,.;]*?))?(?:;|\.| - |$)/i)) { | ||
$percent = $2; # $percent_regexp | ||
$ingredient = $1; | ||
$matched_text = $&; | ||
|
@@ -1109,7 +1125,7 @@ sub parse_specific_ingredients_from_text($$$) { | |
} | ||
|
||
# Origine du Cacao: Pérou | ||
elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;]+?)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { | ||
elsif ($text =~ /\s*(?:origine (?:de |du |de la |des |de l'))([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { | ||
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") | ||
# in order to not overmatch something like "Origin of milk: UK, some other mention." | ||
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. | ||
|
@@ -1118,6 +1134,8 @@ sub parse_specific_ingredients_from_text($$$) { | |
$matched_text = $&; | ||
# Remove the matched text | ||
$text = $` . ' ' . $'; | ||
# Remove extra spaces | ||
$ingredient =~ s/\s+$//; | ||
} | ||
|
||
} | ||
|
@@ -1145,6 +1163,136 @@ sub parse_specific_ingredients_from_text($$$) { | |
} | ||
|
||
|
||
=head2 parse_origins_from_text ( product_ref, $text)

This function parses the origins of ingredients field to extract the origins of specific ingredients.
The origins are stored in the specific_ingredients structure of the product.

Note: this function is similar to parse_specific_ingredients_from_text() that operates on ingredients lists.
The difference is that parse_specific_ingredients_from_text() only extracts and recognizes text that is
an extra mention at the end of an ingredient list (e.g. "Origin of strawberries: Spain"),
while parse_origins_from_text() will also recognize text like "Strawberries: Spain".

=head3 Arguments

=head4 product_ref

Reference to the product hash. Its "lc" field selects the language specific patterns
(currently "en" and "fr"). Each recognized mention is pushed onto its specific_ingredients array
as a hash with the fields id, ingredient, text and origins.

=head4 text $text

Content of the origins of ingredients field.

=head3 Return values

=head4 remaining text

The input text with every recognized mention removed (each one is replaced by a single space).
The text is returned unchanged if it is undefined, if the product has no language code,
or if there are no patterns for the language of the product.

=cut

sub parse_origins_from_text($$) {

	my $product_ref = shift;
	my $text = shift;

	my $product_lc = $product_ref->{lc};

	# Nothing to parse: return the input as is instead of matching against undef
	if ((not defined $text) or (not defined $product_lc)) {
		return $text;
	}

	# Language specific patterns, tried in order: the most specific pattern comes first.
	#
	# Note: in the regular expressions below, use non-capturing groups (starting with (?: )
	# for all groups, except the 2 groups that capture actual data:
	# $1 is the ingredient name, $2 is the origins
	#
	# Regexps should match until we reach a . ; or the end of the text
	#
	# Note: the regexps do not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
	# in order to not overmatch something like "Origin of milk: UK, some other mention."
	# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
	#
	# NOTE(review): the "origin of" / "origine de" regexps are duplicated in parse_specific_ingredients_from_text().
	# Keep both copies in sync, or move them to shared constants.
	my %origins_regexps = (
		en => [
			# Origin of the milk: United Kingdom.
			qr/\s*(?:origin of (?:the )?)([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i,
			# Strawberries: Spain
			qr/\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i,
		],
		fr => [
			# Origine du Cacao: Pérou
			qr/\s*(?:origine (?:de |du |de la |des |de l'))([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i,
			# Cacao: Pérou
			qr/\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i,
			# TODO:
			# Fraises de Bretagne
			# Filet de dinde de Vendée
		],
	);

	my $regexps_ref = $origins_regexps{$product_lc} || [];

	# Go through the text multiple times, as long as one of the patterns matches.
	# Every match removes the matched text, so the text gets shorter and the loop terminates.
	my $found_match = 1;

	while ($found_match) {

		$found_match = 0;

		foreach my $regexp (@{$regexps_ref}) {

			next if $text !~ $regexp;

			# Copy the match variables right away: the substitutions below reset them
			my ($ingredient, $origins) = ($1, $2);
			my $matched_text = $&;

			# Remove the matched text
			$text = $` . ' ' . $';

			# Remove extra spaces (e.g. "Strawberries : Spain" captures "Strawberries ")
			$ingredient =~ s/\s+$//;
			$matched_text =~ s/^\s+//;

			# Save the ingredient and its origins in specific_ingredients
			my $specific_ingredients_ref = {
				id => canonicalize_taxonomy_tag($product_lc, "ingredients", $ingredient),
				ingredient => $ingredient,
				text => $matched_text,
				origins => join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origins)),
			};

			push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;

			# Start again from the first (most specific) pattern on the remaining text
			$found_match = 1;
			last;
		}
	}

	return $text;
}
|
||
|
||
=head2 parse_ingredients_text ( product_ref ) | ||
|
||
Parse the ingredients_text field to extract individual ingredients. | ||
|
@@ -2165,15 +2313,28 @@ sub extract_ingredients_from_text($) { | |
|
||
delete $product_ref->{ingredients_percent_analysis}; | ||
|
||
# Parse the ingredients list to extract individual ingredients and sub-ingredients | ||
# to create the ingredients array with nested sub-ingredients arrays | ||
# The specific ingredients array will contain indications regarding the percentage, | ||
# origins, labels etc. of specific ingredients. This information may come from: | ||
# - the origin of ingredients field ("origin") | ||
# - labels (e.g. "British eggs") | ||
# - the end of the list of the ingredients. e.g. "Origin of the rice: Thailand" | ||
|
||
$product_ref->{specific_ingredients} = []; | ||
|
||
parse_ingredients_text($product_ref); | ||
# Ingredients origins may be listed in the origin field | ||
# e.g. "Origin of the rice: Thailand." | ||
my $product_lc = $product_ref->{lc}; | ||
if (defined $product_ref->{"origin_" . $product_lc}) { | ||
parse_origins_from_text($product_ref, $product_ref->{"origin_" . $product_lc}); | ||
} | ||
|
||
# Add specific ingredients from labels | ||
add_specific_ingredients_from_labels($product_ref); | ||
add_specific_ingredients_from_labels($product_ref); | ||
|
||
# Parse the ingredients list to extract individual ingredients and sub-ingredients | ||
# to create the ingredients array with nested sub-ingredients arrays | ||
|
||
parse_ingredients_text($product_ref); | ||
|
||
if (defined $product_ref->{ingredients}) { | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.