feat: improved parsing of origins of ingredients (#7398)

* refactor of tags regexps used in Ingredients.pm + improved parsing
openfoodfacts · Sep 27, 2022 · f14ca5a · f14ca5a
1 parent 2c09c66
commit f14ca5a
Show file tree

Hide file tree

Showing 17 changed files with 977 additions and 126 deletions.
diff --git a/docker/jslint.yml b/docker/jslint.yml
@@ -12,4 +12,4 @@ services:
       - ./.stylelintignore:/opt/product-opener/.stylelintignore
       # extra files to lint
       - ./scripts:/opt/product-opener/scripts
-      - ./test:/opt/product-opener/test
+      - ./tests:/opt/product-opener/tests
diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm
@@ -526,39 +526,35 @@ my %ingredients_processing_regexps = ();
 
 sub init_ingredients_processing_regexps() {
 
-	foreach my $ingredients_processing ( keys %{ $translations_to{ingredients_processing} } ) {
+	# Create, for each language, a list of regexps matching every synonym of each ingredients processing entry
+	%ingredients_processing_regexps = %{
+		generate_regexps_matching_taxonomy_entries("ingredients_processing", "list_of_regexps",
+			{
+				#add_simple_plurals => 1,
+				#add_simple_singulars => 1,
+				match_space_with_dash => 1,
+			}
+		)
+	};
 
-		foreach my $l ( keys %{ $translations_to{ingredients_processing}{$ingredients_processing} } ) {
+	return;
+}
 
-			defined $ingredients_processing_regexps{$l}  or $ingredients_processing_regexps{$l}  = [];
 
-			# the synonyms below also contain the main translation as the first entry
+# Origins processing regexps
 
-			my $l_ingredients_processing = get_string_id_for_lang($l, $translations_to{ingredients_processing}{$ingredients_processing}{$l});
+my %origins_regexps = ();
 
-			foreach my $synonym ( @{$synonyms_for{ingredients_processing}{$l}{$l_ingredients_processing}} ) {
-				# Make spaces match dashes and the reverse
-				$synonym =~ s/( |-)/\(\?: \|-\)/g;
-				push @{ $ingredients_processing_regexps{$l} },
-					[ $ingredients_processing, $synonym ];
+sub init_origins_regexps() {
 
-				if ( ( my $unacc = unac_string_perl($synonym) ) ne $synonym ) {
-					push @{ $ingredients_processing_regexps{$l} },
-						[ $ingredients_processing, $unacc ];
-				}
+	# Create, for each language, one single regexp matching all synonyms of all origins
+	%origins_regexps = %{
+		generate_regexps_matching_taxonomy_entries("origins", "unique_regexp",
+			{
+				match_space_with_dash => 1,
 			}
-		}
-	}
-
-	# We want to match the longest strings first
-	# Unfortunately, the following does not work:
-	# my $regexp = join('|', sort { length($b) <=> length($a) } keys %synonyms);
-	# -> if we have (gehackte|gehackt) and we parse "gehackte something", it will match "gehackt".
-	foreach my $lc ( keys %ingredients_processing_regexps ) {
-		@{ $ingredients_processing_regexps{$lc} }
-			= sort { length $b->[1] <=> length $a->[1] }
-			@{ $ingredients_processing_regexps{$lc} };
-	}
+		)
+	};
 
 	return;
 }
@@ -571,43 +567,23 @@ my %additives_classes_regexps = ();
 sub init_additives_classes_regexps() {
 
 	# Create a regexp with all synonyms of all additives classes
-	my %additives_classes_synonyms = ();
-
-	foreach my $additives_class (keys %{$translations_to{additives_classes}}) {
-
-		# do not turn vitamin a in vitamin : a-z
-		next if $additives_class eq "en:vitamins";
-
-		foreach my $l (keys %{$translations_to{additives_classes}{$additives_class}}) {
-
-			defined $additives_classes_synonyms{$l} or $additives_classes_synonyms{$l} = {};
-
-			# the synonyms below also contain the main translation as the first entry
-
-			my $l_additives_class = get_string_id_for_lang($l, $translations_to{additives_classes}{$additives_class}{$l});
-
-			foreach my $synonym (@{$synonyms_for{additives_classes}{$l}{$l_additives_class}}) {
-				$additives_classes_synonyms{$l}{$synonym} = 1;
-				# simple singulars and plurals + unaccented forms
-				$additives_classes_synonyms{$l}{unac_string_perl($synonym)} = 1;
-				$synonym =~ s/s$//;
-				$additives_classes_synonyms{$l}{$synonym} = 1;
-				$additives_classes_synonyms{$l}{unac_string_perl($synonym)} = 1;
-				$additives_classes_synonyms{$l}{$synonym . "s"} = 1;
-				$additives_classes_synonyms{$l}{unac_string_perl($synonym . "s")} = 1;
+	%additives_classes_regexps = %{
+		generate_regexps_matching_taxonomy_entries("additives_classes", "unique_regexp",
+			{
+				add_simple_plurals => 1,
+				add_simple_singulars => 1,
+				# 2022-09-22: not sure if the following is still needed
+				# before refactoring, we had a comment about not turning
+				# "vitamin A" into "vitamin : A", but it does not happen
+				# skip_entries_matching => '/^en:vitamins$/',
 			}
-		}
-	}
-
-	foreach my $l (sort keys %additives_classes_synonyms) {
-		# Match the longest strings first
-		$additives_classes_regexps{$l} = join('|', sort { length($b) <=> length($a) } keys %{$additives_classes_synonyms{$l}});
-		# print STDERR "additives_classes_regexps{$l}: " . $additives_classes_regexps{$l} . "\n";
-	}
+		)
+	};
 
 	return;
 }
 
+
 if ((keys %labels_regexps) > 0) { exit; }
 
 # load ingredients classes
@@ -1125,8 +1101,9 @@ sub parse_specific_ingredients_from_text($product_ref, $text, $percent_regexp) {
 				text => $matched_text,
 			};
 
+			my $and_or = $and_or{$product_lc};
 			defined $percent and $specific_ingredients_ref->{percent} = $percent + 0;
-			defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origins ));
+			defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,|$and_or/, $origins ));
 
 			push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
 		}
@@ -1143,8 +1120,13 @@ sub parse_specific_ingredients_from_text($product_ref, $text, $percent_regexp) {
 
 sub match_ingredient_origin($product_lc, $text_ref, $matched_ingredient_ref) {
 
-	# Strawberries: Spain
-	if ($$text_ref =~ /\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i) {
+	my $origins_regexp = $origins_regexps{$product_lc};
+	my $and_or = $and_or{$product_lc} || ',';
+	my $from = $from{$product_lc} || ':';
+
+	# Strawberries: Spain, Italy and Portugal
+	# Strawberries from Spain, Italy and Portugal
+	if ($$text_ref =~ /\s*([^,.;:]+)(?::|$from)\s*((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i) {
 		# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
 		# in order to not overmatch something like "Origin of milk: UK, some other mention."
 		# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
@@ -1170,20 +1152,23 @@ sub match_origin_of_the_ingredient_origin($product_lc, $text_ref, $matched_ingre
 	);
 
 	my $origin_of_the_regexp = $origin_of_the_regexp_in_lc{$product_lc} || $origin_of_the_regexp_in_lc{en};
+	my $origins_regexp = $origins_regexps{$product_lc};
+	my $and_or = $and_or{$product_lc} || ',';
 
 	# Origin of the milk: United Kingdom.
-	if ($$text_ref =~ /\s*${origin_of_the_regexp}([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) {
-		# Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE")
-		# in order to not overmatch something like "Origin of milk: UK, some other mention."
-		# In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy.
+	if ($origins_regexp
+		and ($$text_ref =~ /\s*${origin_of_the_regexp}([^,.;:]+)(?::| )+((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i)) {
 
 		$matched_ingredient_ref->{ingredient} = $1;
 		$matched_ingredient_ref->{origins} = $2;
 		$matched_ingredient_ref->{matched_text} = $&;
-
+		
 		# Remove the matched text
 		$$text_ref = $` . ' ' . $';
 
+		# replace and / or
+		#$matched_ingredient_ref->{origins} =~ s/($origins_regexp)(?:$and_or)($origins_regexp)/$1,$2/g;		
+
 		return 1;
 	}
 	return 0;
@@ -1251,8 +1236,9 @@ sub parse_origins_from_text($product_ref, $text) {
 				};
 
 				if (defined $matched_ingredient_ref->{origins}) {
+					my $and_or = $and_or{$product_lc};
 					$specific_ingredients_ref->{origins} = join(",",
-						map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $matched_ingredient_ref->{origins}));
+						map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,|$and_or/, $matched_ingredient_ref->{origins}));
 				}
 
 				push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref;
@@ -4690,6 +4676,7 @@ sub preparse_ingredients_text($product_lc, $text) {
 		init_ingredients_processing_regexps();
 		init_additives_classes_regexps();
 		init_allergens_regexps();
+		init_origins_regexps();
 	}
 
 	my $and = $and{$product_lc} || " and ";

diff --git a/lib/ProductOpener/Packaging.pm b/lib/ProductOpener/Packaging.pm
@@ -106,38 +106,15 @@ sub init_packaging_taxonomies_regexps() {
 
 	foreach my $taxonomy (values %packaging_taxonomies) {
 
-		$packaging_taxonomies_regexps{$taxonomy} = {};	# keys: languages
-
-		foreach my $tagid (get_all_taxonomy_entries($taxonomy)) {
-
-			foreach my $language (keys %{$translations_to{$taxonomy}{$tagid}}) {
-
-				defined $packaging_taxonomies_regexps{$taxonomy}{$language} or $packaging_taxonomies_regexps{$taxonomy}{$language} = [];
-
-				foreach my $synonym (get_taxonomy_tag_synonyms($language, $taxonomy, $tagid)) {
-
-					push @{$packaging_taxonomies_regexps{$taxonomy}{$language}}, [$tagid, $synonym];
-
-					if ((my $unaccented_synonym = unac_string_perl($synonym)) ne $synonym) {
-
-						push @{$packaging_taxonomies_regexps{$taxonomy}{$language}}, [$tagid, $unaccented_synonym];
-					}
-				}
+		$packaging_taxonomies_regexps{$taxonomy} = 
+		generate_regexps_matching_taxonomy_entries($taxonomy, "list_of_regexps",
+			{
 			}
-		}
-
-		# We want to match the longest strings first
-
-		foreach my $language (keys %{$packaging_taxonomies_regexps{$taxonomy}}) {
-			@{$packaging_taxonomies_regexps{$taxonomy}{$language}}
-				= sort { length($b->[1]) <=> length($a->[1]) } @{$packaging_taxonomies_regexps{$taxonomy}{$language}};
-		}
+		);
 
 		$log->debug("init_packaging_taxonomies_regexps - result", { taxonomy => $taxonomy, packaging_taxonomies_regexps => $packaging_taxonomies_regexps{$taxonomy}  }) if $log->is_debug();
 	}
 
-	# used only for debugging
-	#store("packaging_taxonomies_regexps.sto", \%packaging_taxonomies_regexps);
 	return;
 }
 

diff --git a/lib/ProductOpener/Tags.pm b/lib/ProductOpener/Tags.pm
@@ -152,6 +152,8 @@ BEGIN
 		&get_all_taxonomy_entries
 		&get_taxonomy_tag_synonyms
 
+		&generate_regexps_matching_taxonomy_entries
+
 		);    # symbols to export on request
 	%EXPORT_TAGS = (all => [@EXPORT_OK]);
 }
@@ -4224,7 +4226,102 @@ sub add_users_translations_to_taxonomy($tagtype) {
 	return;
 }
 
+=head2 generate_regexps_matching_taxonomy_entries($taxonomy, $return_type, $options_ref)
+
+Create regular expressions that will match entries of a taxonomy.
+
+=head3 Arguments
+
+=head4 $taxonomy
+
+The type of the tag (e.g. categories, labels, allergens)
+
+=head4 $return_type - string
+
+Either "unique_regexp" to get one single regexp for all entries of one language.
+
+Or "list_of_regexps" to get a list of regexps (1 per synonym of each entry) for each language.
+For each synonym, we return an array with the entry id, and the regexp for that synonym.
+e.g. ['en:coffee',"coffee(?:s?)"] - any other return type is a fatal error (die).
+
+=head4 $options_ref
+
+A reference to a hash to enable options to indicate how to match:
+
+- add_simple_plurals : in some languages, like French, we will allow an extra "s" at the end of entries
+- add_simple_singulars: same with removing the "s" at the end of entries
+- match_space_with_dash: spaces or dashes in entries will match either a space or a dash (e.g. "South America" will match "South-America")
+
+=cut
+
+sub generate_regexps_matching_taxonomy_entries($taxonomy, $return_type, $options_ref) {
+
+	# We will return a reference to a hash with, for each language, a unique regexp or a list of regexps
+	my $result_ref = {};
+
+	# Lists of synonyms regular expressions per language
+	my %synonyms_regexps = ();
+
+	foreach my $tagid ( get_all_taxonomy_entries($taxonomy) ) {
+
+		foreach my $language ( keys %{ $translations_to{$taxonomy}{$tagid} } ) {
+
+			defined $synonyms_regexps{$language} or $synonyms_regexps{$language}  = [];
+
+			# the synonyms below also contain the main translation as the first entry
+
+			foreach my $synonym ( get_taxonomy_tag_synonyms($language, $taxonomy, $tagid) ) {
 
+				# Test the original ending once, before any regexp syntax is appended:
+				# otherwise, with both options enabled, "colours" would first become
+				# "colour(?:s?)" and then get a second "(?:s?)", matching "colourss"
+				my $ends_with_s = ($synonym =~ /s$/);
+
+				if ($ends_with_s and $options_ref->{add_simple_singulars}) {
+					# match entry without final s
+					$synonym =~ s/s$/\(\?:s\?\)/;
+				}
+				elsif ((not $ends_with_s) and $options_ref->{add_simple_plurals}) {
+					# match entry with additional final s
+					$synonym =~ s/$/\(\?:s\?\)/;
+				}
+
+				if ($options_ref->{match_space_with_dash}) {
+					# Make spaces match dashes and the reverse
+					$synonym =~ s/( |-)/\(\?: \|-\)/g;
+				}
+
+				push @{ $synonyms_regexps{$language} },
+					[ $tagid, $synonym ];
+
+				if ( ( my $unaccented_synonym = unac_string_perl($synonym) ) ne $synonym ) {
+					push @{ $synonyms_regexps{$language} },
+						[ $tagid, $unaccented_synonym ];
+				}
+			}
+		}
+	}
+
+	# We want to match the longest strings first
+
+	if ($return_type eq 'unique_regexp') {
+		foreach my $language ( keys %synonyms_regexps ) {
+			$result_ref->{$language} = join('|',
+				map { $_->[1] }
+					sort { length $b->[1] <=> length $a->[1] } @{ $synonyms_regexps{$language} } );
+		}
+	}
+	elsif ($return_type eq 'list_of_regexps') {
+		foreach my $language ( keys %synonyms_regexps ) {
+			@{$result_ref->{$language}} = sort { length $b->[1] <=> length $a->[1] } @{ $synonyms_regexps{$language} };
+		}
+	}
+	else {
+		die("unknown return type for generate_regexps_matching_taxonomy_entries: $return_type - must be unique_regexp or list_of_regexps");
+	}
+
+	return $result_ref;
+}
 
 
 $log->info("Tags.pm loaded") if $log->is_info();

diff --git a/package.json b/package.json
@@ -15,7 +15,7 @@
     "test:integration": "mocha --timeout 10000 -r dotenv/config",
     "prove": "prove -l --jobs 2",
     "lint": "npm run lint:js && npm run lint:css && npm run lint:scss",
-    "lint:js": "eslint gulpfile.js html/js/*.js scripts/*.js test/*.js",
+    "lint:js": "eslint gulpfile.js html/js/*.js scripts/*.js tests/integration/*.js",
     "lint:css": "stylelint html/css/*.css",
     "lint:scss": "stylelint scss/*.scss",
     "perlc": "npm run perlc:startup && npm run perlc:cgi && npm run perlc:scripts",

diff --git a/test/query.test.js → tests/integration/query.test.js b/test/query.test.js → tests/integration/query.test.js