Skip to content

Commit

Permalink
feat: improved parsing of origins of ingredients (#7398)
Browse files Browse the repository at this point in the history
* refactor of tags regexps used in Ingredients.pm + improved parsing
  • Loading branch information
stephanegigandet authored Sep 27, 2022
1 parent 2c09c66 commit f14ca5a
Show file tree
Hide file tree
Showing 17 changed files with 977 additions and 126 deletions.
2 changes: 1 addition & 1 deletion docker/jslint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ services:
- ./.stylelintignore:/opt/product-opener/.stylelintignore
# extra files to lint
- ./scripts:/opt/product-opener/scripts
- ./test:/opt/product-opener/test
- ./tests:/opt/product-opener/tests
121 changes: 54 additions & 67 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -526,39 +526,35 @@ my %ingredients_processing_regexps = ();

sub init_ingredients_processing_regexps() {

foreach my $ingredients_processing ( keys %{ $translations_to{ingredients_processing} } ) {
# Create a list of regexps with each synonyms of all ingredients processes
%ingredients_processing_regexps = %{
generate_regexps_matching_taxonomy_entries("ingredients_processing", "list_of_regexps",
{
#add_simple_plurals => 1,
#add_simple_singulars => 1,
match_space_with_dash => 1,
}
)
};

foreach my $l ( keys %{ $translations_to{ingredients_processing}{$ingredients_processing} } ) {
return;
}

defined $ingredients_processing_regexps{$l} or $ingredients_processing_regexps{$l} = [];

# the synonyms below also contain the main translation as the first entry
# Origins processing regexps

my $l_ingredients_processing = get_string_id_for_lang($l, $translations_to{ingredients_processing}{$ingredients_processing}{$l});
my %origins_regexps = ();

foreach my $synonym ( @{$synonyms_for{ingredients_processing}{$l}{$l_ingredients_processing}} ) {
# Make spaces match dashes and the reverse
$synonym =~ s/( |-)/\(\?: \|-\)/g;
push @{ $ingredients_processing_regexps{$l} },
[ $ingredients_processing, $synonym ];
sub init_origins_regexps() {

if ( ( my $unacc = unac_string_perl($synonym) ) ne $synonym ) {
push @{ $ingredients_processing_regexps{$l} },
[ $ingredients_processing, $unacc ];
}
# Create one regexp per language that matches any synonym of any entry of the origins taxonomy
%origins_regexps = %{
generate_regexps_matching_taxonomy_entries("origins", "unique_regexp",
{
match_space_with_dash => 1,
}
}
}

# We want to match the longest strings first
# Unfortunately, the following does not work:
# my $regexp = join('|', sort { length($b) <=> length($a) } keys %synonyms);
# -> if we have (gehackte|gehackt) and we parse "gehackte something", it will match "gehackt".
foreach my $lc ( keys %ingredients_processing_regexps ) {
@{ $ingredients_processing_regexps{$lc} }
= sort { length $b->[1] <=> length $a->[1] }
@{ $ingredients_processing_regexps{$lc} };
}
)
};

return;
}
Expand All @@ -571,43 +567,23 @@ my %additives_classes_regexps = ();
sub init_additives_classes_regexps() {

# Create a regexp with all synonyms of all additives classes
my %additives_classes_synonyms = ();

foreach my $additives_class (keys %{$translations_to{additives_classes}}) {

# do not turn vitamin a in vitamin : a-z
next if $additives_class eq "en:vitamins";

foreach my $l (keys %{$translations_to{additives_classes}{$additives_class}}) {

defined $additives_classes_synonyms{$l} or $additives_classes_synonyms{$l} = {};

# the synonyms below also contain the main translation as the first entry

my $l_additives_class = get_string_id_for_lang($l, $translations_to{additives_classes}{$additives_class}{$l});

foreach my $synonym (@{$synonyms_for{additives_classes}{$l}{$l_additives_class}}) {
$additives_classes_synonyms{$l}{$synonym} = 1;
# simple singulars and plurals + unaccented forms
$additives_classes_synonyms{$l}{unac_string_perl($synonym)} = 1;
$synonym =~ s/s$//;
$additives_classes_synonyms{$l}{$synonym} = 1;
$additives_classes_synonyms{$l}{unac_string_perl($synonym)} = 1;
$additives_classes_synonyms{$l}{$synonym . "s"} = 1;
$additives_classes_synonyms{$l}{unac_string_perl($synonym . "s")} = 1;
%additives_classes_regexps = %{
generate_regexps_matching_taxonomy_entries("additives_classes", "unique_regexp",
{
add_simple_plurals => 1,
add_simple_singulars => 1,
# 2022-09-22: not sure if the following is still needed
# before refactoring, we had a comment about not turning
# "vitamin A" into "vitamin : A", but it does not happen
# skip_entries_matching => '/^en:vitamins$/',
}
}
}

foreach my $l (sort keys %additives_classes_synonyms) {
# Match the longest strings first
$additives_classes_regexps{$l} = join('|', sort { length($b) <=> length($a) } keys %{$additives_classes_synonyms{$l}});
# print STDERR "additives_classes_regexps{$l}: " . $additives_classes_regexps{$l} . "\n";
}
)
};

return;
}


if ((keys %labels_regexps) > 0) { exit; }

# load ingredients classes
Expand Down Expand Up @@ -1125,8 +1101,9 @@ sub parse_specific_ingredients_from_text($product_ref, $text, $percent_regexp) {
text => $matched_text,
};

my $and_or = $and_or{$product_lc};
defined $percent and $specific_ingredients_ref->{percent} = $percent + 0;
defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origins ));
defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,|$and_or/, $origins ));

push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
}
Expand All @@ -1143,8 +1120,13 @@ sub parse_specific_ingredients_from_text($product_ref, $text, $percent_regexp) {

sub match_ingredient_origin($product_lc, $text_ref, $matched_ingredient_ref) {

# Strawberries: Spain
if ($$text_ref =~ /\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i) {
my $origins_regexp = $origins_regexps{$product_lc};
my $and_or = $and_or{$product_lc} || ',';
my $from = $from{$product_lc} || ':';

# Strawberries: Spain, Italy and Portugal
# Strawberries from Spain, Italy and Portugal
if ($$text_ref =~ /\s*([^,.;:]+)(?::|$from)\s*((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i) {
# Note: the regexp above matches multiple origins separated by commas or by $and_or (e.g. "Origins of milk: UK, UE"),
# but only when each of them is matched by the origins regexp generated from the origins taxonomy,
# in order to not overmatch something like "Origin of milk: UK, some other mention."
Expand All @@ -1170,20 +1152,23 @@ sub match_origin_of_the_ingredient_origin($product_lc, $text_ref, $matched_ingre
);

my $origin_of_the_regexp = $origin_of_the_regexp_in_lc{$product_lc} || $origin_of_the_regexp_in_lc{en};
my $origins_regexp = $origins_regexps{$product_lc};
my $and_or = $and_or{$product_lc} || ',';

# Origin of the milk: United Kingdom.
if ($$text_ref =~ /\s*${origin_of_the_regexp}([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
# in order to not overmatch something like "Origin of milk: UK, some other mention."
# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
if ($origins_regexp
and ($$text_ref =~ /\s*${origin_of_the_regexp}([^,.;:]+)(?::| )+((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i)) {

$matched_ingredient_ref->{ingredient} = $1;
$matched_ingredient_ref->{origins} = $2;
$matched_ingredient_ref->{matched_text} = $&;

# Remove the matched text
$$text_ref = $` . ' ' . $';

# replace and / or
#$matched_ingredient_ref->{origins} =~ s/($origins_regexp)(?:$and_or)($origins_regexp)/$1,$2/g;

return 1;
}
return 0;
Expand Down Expand Up @@ -1251,8 +1236,9 @@ sub parse_origins_from_text($product_ref, $text) {
};

if (defined $matched_ingredient_ref->{origins}) {
my $and_or = $and_or{$product_lc};
$specific_ingredients_ref->{origins} = join(",",
map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $matched_ingredient_ref->{origins}));
map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,|$and_or/, $matched_ingredient_ref->{origins}));
}

push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
Expand Down Expand Up @@ -4690,6 +4676,7 @@ sub preparse_ingredients_text($product_lc, $text) {
init_ingredients_processing_regexps();
init_additives_classes_regexps();
init_allergens_regexps();
init_origins_regexps();
}

my $and = $and{$product_lc} || " and ";
Expand Down
31 changes: 4 additions & 27 deletions lib/ProductOpener/Packaging.pm
Original file line number Diff line number Diff line change
Expand Up @@ -106,38 +106,15 @@ sub init_packaging_taxonomies_regexps() {

foreach my $taxonomy (values %packaging_taxonomies) {

$packaging_taxonomies_regexps{$taxonomy} = {}; # keys: languages

foreach my $tagid (get_all_taxonomy_entries($taxonomy)) {

foreach my $language (keys %{$translations_to{$taxonomy}{$tagid}}) {

defined $packaging_taxonomies_regexps{$taxonomy}{$language} or $packaging_taxonomies_regexps{$taxonomy}{$language} = [];

foreach my $synonym (get_taxonomy_tag_synonyms($language, $taxonomy, $tagid)) {

push @{$packaging_taxonomies_regexps{$taxonomy}{$language}}, [$tagid, $synonym];

if ((my $unaccented_synonym = unac_string_perl($synonym)) ne $synonym) {

push @{$packaging_taxonomies_regexps{$taxonomy}{$language}}, [$tagid, $unaccented_synonym];
}
}
$packaging_taxonomies_regexps{$taxonomy} =
generate_regexps_matching_taxonomy_entries($taxonomy, "list_of_regexps",
{
}
}

# We want to match the longest strings first

foreach my $language (keys %{$packaging_taxonomies_regexps{$taxonomy}}) {
@{$packaging_taxonomies_regexps{$taxonomy}{$language}}
= sort { length($b->[1]) <=> length($a->[1]) } @{$packaging_taxonomies_regexps{$taxonomy}{$language}};
}
);

$log->debug("init_packaging_taxonomies_regexps - result", { taxonomy => $taxonomy, packaging_taxonomies_regexps => $packaging_taxonomies_regexps{$taxonomy} }) if $log->is_debug();
}

# used only for debugging
#store("packaging_taxonomies_regexps.sto", \%packaging_taxonomies_regexps);
return;
}

Expand Down
97 changes: 97 additions & 0 deletions lib/ProductOpener/Tags.pm
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ BEGIN
&get_all_taxonomy_entries
&get_taxonomy_tag_synonyms
&generate_regexps_matching_taxonomy_entries
); # symbols to export on request
%EXPORT_TAGS = (all => [@EXPORT_OK]);
}
Expand Down Expand Up @@ -4224,7 +4226,102 @@ sub add_users_translations_to_taxonomy($tagtype) {
return;
}

=head2 generate_regexps_matching_taxonomy_entries($taxonomy, $return_type, $options_ref)

Create regular expressions that will match entries of a taxonomy.

=head3 Arguments

=head4 $taxonomy

The type of the tag (e.g. categories, labels, allergens)

=head4 $return_type - string

Either "unique_regexp" to get one single regexp for all entries of one language.

Or "list_of_regexps" to get a list of regexps (1 per entry) for each language.
For each entry, we return an array with the entry id, and the regexp for that entry.
e.g. ['en:coffee',"coffee|coffees"]

Any other value makes the function die.

=head4 $options_ref

A reference to a hash to enable options to indicate how to match (optional, defaults to no options):

- add_simple_plurals : in some languages, like French, we will allow an extra "s" at the end of entries
- add_simple_singulars: same with removing the "s" at the end of entries
- match_space_with_dash: spaces or dashes in entries will match either a space or a dash (e.g. "South America" will match "South-America")

=head3 Return value

A reference to a hash with language codes as keys. Depending on $return_type, each value is
either a string with alternatives joined by "|", or a reference to an array of [entry id, regexp] pairs.
In both cases the regexps are ordered from the longest to the shortest.

=cut

sub generate_regexps_matching_taxonomy_entries($taxonomy, $return_type, $options_ref = {}) {

	# Fail fast on an unsupported return type, before walking the whole taxonomy
	if (($return_type ne 'unique_regexp') and ($return_type ne 'list_of_regexps')) {
		die("unknown return type for generate_regexps_matching_taxonomy_entries: $return_type - must be unique_regexp or list_of_regexps");
	}

	# Tolerate an explicit undef passed instead of an options hash
	$options_ref //= {};

	# Optional final "s", and "space or dash", as regexp fragments
	my $optional_s = '(?:s?)';
	my $space_or_dash = '(?: |-)';

	# We will return for each language an unique regexp or a list of regexps
	my $result_ref = {};

	# Lists of [entry id, synonym regexp] pairs per language
	my %synonyms_regexps = ();

	foreach my $tagid (get_all_taxonomy_entries($taxonomy)) {

		foreach my $language (keys %{$translations_to{$taxonomy}{$tagid}}) {

			$synonyms_regexps{$language} //= [];

			# the synonyms below also contain the main translation as the first entry

			foreach my $synonym (get_taxonomy_tag_synonyms($language, $taxonomy, $tagid)) {

				# Work on a copy so that we never alter the value we iterate on
				# NOTE(review): the synonym is used as a regexp without quotemeta;
				# presumably taxonomy synonyms contain no regexp metacharacters - TODO confirm
				my $regexp = $synonym;

				# Decide between singular and plural handling once, on the original synonym:
				# testing the final "s" again after the singular rewrite would succeed
				# (the string then ends with ")") and add a second optional "s",
				# which would make "colorants" also match "colorantss".
				if ($regexp =~ /s$/) {
					if ($options_ref->{add_simple_singulars}) {
						# match entry without final s
						$regexp =~ s/s$/$optional_s/;
					}
				}
				elsif ($options_ref->{add_simple_plurals}) {
					# match entry with additional final s
					$regexp .= $optional_s;
				}

				if ($options_ref->{match_space_with_dash}) {
					# Make spaces match dashes and the reverse
					$regexp =~ s/[ -]/$space_or_dash/g;
				}

				push @{$synonyms_regexps{$language}}, [$tagid, $regexp];

				# Also match the unaccented form when it differs
				my $unaccented_regexp = unac_string_perl($regexp);
				if ($unaccented_regexp ne $regexp) {
					push @{$synonyms_regexps{$language}}, [$tagid, $unaccented_regexp];
				}
			}
		}
	}

	# We want to match the longest strings first

	foreach my $language (keys %synonyms_regexps) {

		my @longest_first = sort { length($b->[1]) <=> length($a->[1]) } @{$synonyms_regexps{$language}};

		if ($return_type eq 'unique_regexp') {
			$result_ref->{$language} = join('|', map { $_->[1] } @longest_first);
		}
		else {
			# list_of_regexps
			$result_ref->{$language} = \@longest_first;
		}
	}

	return $result_ref;
}


$log->info("Tags.pm loaded") if $log->is_info();
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"test:integration": "mocha --timeout 10000 -r dotenv/config",
"prove": "prove -l --jobs 2",
"lint": "npm run lint:js && npm run lint:css && npm run lint:scss",
"lint:js": "eslint gulpfile.js html/js/*.js scripts/*.js test/*.js",
"lint:js": "eslint gulpfile.js html/js/*.js scripts/*.js tests/integration/*.js",
"lint:css": "stylelint html/css/*.css",
"lint:scss": "stylelint scss/*.scss",
"perlc": "npm run perlc:startup && npm run perlc:cgi && npm run perlc:scripts",
Expand Down
File renamed without changes.
Loading

0 comments on commit f14ca5a

Please sign in to comment.