From f14ca5a2a80202885654e9691bd894c3af3b8c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Tue, 27 Sep 2022 11:35:06 +0200 Subject: [PATCH] feat: improved parsing of origins of ingredients (#7398) * refactor of tags regexps used in Ingredients.pm + improved parsing --- docker/jslint.yml | 2 +- lib/ProductOpener/Ingredients.pm | 121 +++---- lib/ProductOpener/Packaging.pm | 31 +- lib/ProductOpener/Tags.pm | 97 ++++++ package.json | 2 +- {test => tests/integration}/query.test.js | 0 tests/unit/additives_tags.t | 45 +++ .../en-origin-field-with-commas-and.json | 84 +++++ .../en-origin-field-with-commas.json | 77 +++++ .../en-origin-ingredient-from-origin.json | 79 +++++ ...n-origin-ingredient-origin-and-origin.json | 79 +++++ .../ingredients/en-vitamin.json | 60 ++++ .../ingredients/fr-origin-field.json | 12 + ...r-origin-ingredient-origin-and-origin.json | 310 ++++++++++++++++++ tests/unit/ingredients.t | 102 ++++-- {t => tests/unit}/tags_unit.t | 0 tests/update_tests_results.sh | 2 +- 17 files changed, 977 insertions(+), 126 deletions(-) rename {test => tests/integration}/query.test.js (100%) create mode 100644 tests/unit/expected_test_results/ingredients/en-origin-field-with-commas-and.json create mode 100644 tests/unit/expected_test_results/ingredients/en-origin-field-with-commas.json create mode 100644 tests/unit/expected_test_results/ingredients/en-origin-ingredient-from-origin.json create mode 100644 tests/unit/expected_test_results/ingredients/en-origin-ingredient-origin-and-origin.json create mode 100644 tests/unit/expected_test_results/ingredients/en-vitamin.json create mode 100644 tests/unit/expected_test_results/ingredients/fr-origin-ingredient-origin-and-origin.json rename {t => tests/unit}/tags_unit.t (100%) diff --git a/docker/jslint.yml b/docker/jslint.yml index f68ec15616d32..6e2fa8d1beff8 100644 --- a/docker/jslint.yml +++ b/docker/jslint.yml @@ -12,4 +12,4 @@ services: - 
./.stylelintignore:/opt/product-opener/.stylelintignore # extra files to lint - ./scripts:/opt/product-opener/scripts - - ./test:/opt/product-opener/test + - ./tests:/opt/product-opener/tests diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm index f376ad04084c4..19444e18435e3 100644 --- a/lib/ProductOpener/Ingredients.pm +++ b/lib/ProductOpener/Ingredients.pm @@ -526,39 +526,35 @@ my %ingredients_processing_regexps = (); sub init_ingredients_processing_regexps() { - foreach my $ingredients_processing ( keys %{ $translations_to{ingredients_processing} } ) { + # Create a list of regexps, one for each synonym of all ingredients processes + %ingredients_processing_regexps = %{ + generate_regexps_matching_taxonomy_entries("ingredients_processing", "list_of_regexps", + { + #add_simple_plurals => 1, + #add_simple_singulars => 1, + match_space_with_dash => 1, + } + ) + }; - foreach my $l ( keys %{ $translations_to{ingredients_processing}{$ingredients_processing} } ) { + return; +} - defined $ingredients_processing_regexps{$l} or $ingredients_processing_regexps{$l} = []; - # the synonyms below also contain the main translation as the first entry +# Origins processing regexps - my $l_ingredients_processing = get_string_id_for_lang($l, $translations_to{ingredients_processing}{$ingredients_processing}{$l}); +my %origins_regexps = (); - foreach my $synonym ( @{$synonyms_for{ingredients_processing}{$l}{$l_ingredients_processing}} ) { - # Make spaces match dashes and the reverse - $synonym =~ s/( |-)/\(\?: \|-\)/g; - push @{ $ingredients_processing_regexps{$l} }, - [ $ingredients_processing, $synonym ]; +sub init_origins_regexps() { - if ( ( my $unacc = unac_string_perl($synonym) ) ne $synonym ) { - push @{ $ingredients_processing_regexps{$l} }, - [ $ingredients_processing, $unacc ]; - } + # Create a single regexp per language matching all synonyms of all origins + %origins_regexps = %{ + generate_regexps_matching_taxonomy_entries("origins", 
"unique_regexp", + { + match_space_with_dash => 1, } - } - } - - # We want to match the longest strings first - # Unfortunately, the following does not work: - # my $regexp = join('|', sort { length($b) <=> length($a) } keys %synonyms); - # -> if we have (gehackte|gehackt) and we parse "gehackte something", it will match "gehackt". - foreach my $lc ( keys %ingredients_processing_regexps ) { - @{ $ingredients_processing_regexps{$lc} } - = sort { length $b->[1] <=> length $a->[1] } - @{ $ingredients_processing_regexps{$lc} }; - } + ) + }; return; } @@ -571,43 +567,23 @@ my %additives_classes_regexps = (); sub init_additives_classes_regexps() { # Create a regexp with all synonyms of all additives classes - my %additives_classes_synonyms = (); - - foreach my $additives_class (keys %{$translations_to{additives_classes}}) { - - # do not turn vitamin a in vitamin : a-z - next if $additives_class eq "en:vitamins"; - - foreach my $l (keys %{$translations_to{additives_classes}{$additives_class}}) { - - defined $additives_classes_synonyms{$l} or $additives_classes_synonyms{$l} = {}; - - # the synonyms below also contain the main translation as the first entry - - my $l_additives_class = get_string_id_for_lang($l, $translations_to{additives_classes}{$additives_class}{$l}); - - foreach my $synonym (@{$synonyms_for{additives_classes}{$l}{$l_additives_class}}) { - $additives_classes_synonyms{$l}{$synonym} = 1; - # simple singulars and plurals + unaccented forms - $additives_classes_synonyms{$l}{unac_string_perl($synonym)} = 1; - $synonym =~ s/s$//; - $additives_classes_synonyms{$l}{$synonym} = 1; - $additives_classes_synonyms{$l}{unac_string_perl($synonym)} = 1; - $additives_classes_synonyms{$l}{$synonym . "s"} = 1; - $additives_classes_synonyms{$l}{unac_string_perl($synonym . 
"s")} = 1; + %additives_classes_regexps = %{ + generate_regexps_matching_taxonomy_entries("additives_classes", "unique_regexp", + { + add_simple_plurals => 1, + add_simple_singulars => 1, + # 2022-09-22: not sure if the following is still needed + # before refactoring, we had a comment about not turning + # "vitamin A" into "vitamin : A", but it does not happen + # skip_entries_matching => '/^en:vitamins$/', } - } - } - - foreach my $l (sort keys %additives_classes_synonyms) { - # Match the longest strings first - $additives_classes_regexps{$l} = join('|', sort { length($b) <=> length($a) } keys %{$additives_classes_synonyms{$l}}); - # print STDERR "additives_classes_regexps{$l}: " . $additives_classes_regexps{$l} . "\n"; - } + ) + }; return; } + if ((keys %labels_regexps) > 0) { exit; } # load ingredients classes @@ -1125,8 +1101,9 @@ sub parse_specific_ingredients_from_text($product_ref, $text, $percent_regexp) { text => $matched_text, }; + my $and_or = $and_or{$product_lc}; defined $percent and $specific_ingredients_ref->{percent} = $percent + 0; - defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $origins )); + defined $origins and $specific_ingredients_ref->{origins} = join(",", map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,|$and_or/, $origins )); push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref; } @@ -1143,8 +1120,13 @@ sub parse_specific_ingredients_from_text($product_ref, $text, $percent_regexp) { sub match_ingredient_origin($product_lc, $text_ref, $matched_ingredient_ref) { - # Strawberries: Spain - if ($$text_ref =~ /\s*([^,.;:]+)(?::)\s*([^,.;]+?)\s*(?:;|\.| - |$)/i) { + my $origins_regexp = $origins_regexps{$product_lc}; + my $and_or = $and_or{$product_lc} || ','; + my $from = $from{$product_lc} || ':'; + + # Strawberries: Spain, Italy and Portugal + # Strawberries from Spain, Italy and Portugal + if ($$text_ref =~ 
/\s*([^,.;:]+)(?::|$from)\s*((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i) { # Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") # in order to not overmatch something like "Origin of milk: UK, some other mention." # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. @@ -1170,20 +1152,23 @@ sub match_origin_of_the_ingredient_origin($product_lc, $text_ref, $matched_ingre ); my $origin_of_the_regexp = $origin_of_the_regexp_in_lc{$product_lc} || $origin_of_the_regexp_in_lc{en}; + my $origins_regexp = $origins_regexps{$product_lc}; + my $and_or = $and_or{$product_lc} || ','; # Origin of the milk: United Kingdom. - if ($$text_ref =~ /\s*${origin_of_the_regexp}([^,.;:]+)(?::| )+([^,.;]+?)\s*(?:;|\.| - |$)/i) { - # Note: the regexp above does not currently match multiple origins with commas (e.g. "Origins of milk: UK, UE") - # in order to not overmatch something like "Origin of milk: UK, some other mention." - # In the future, we could try to be smarter and match more if we can recognize the next words exist in the origins taxonomy. + if ($origins_regexp + and ($$text_ref =~ /\s*${origin_of_the_regexp}([^,.;:]+)(?::| )+((?:$origins_regexp)(?:(?:,|$and_or)(?:\s?)(?:$origins_regexp))*)\s*(?:,|;|\.| - |$)/i)) { $matched_ingredient_ref->{ingredient} = $1; $matched_ingredient_ref->{origins} = $2; $matched_ingredient_ref->{matched_text} = $&; - + # Remove the matched text $$text_ref = $` . ' ' . 
$'; + # replace and / or + #$matched_ingredient_ref->{origins} =~ s/($origins_regexp)(?:$and_or)($origins_regexp)/$1,$2/g; + return 1; } return 0; @@ -1251,8 +1236,9 @@ sub parse_origins_from_text($product_ref, $text) { }; if (defined $matched_ingredient_ref->{origins}) { + my $and_or = $and_or{$product_lc}; $specific_ingredients_ref->{origins} = join(",", - map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,/, $matched_ingredient_ref->{origins})); + map {canonicalize_taxonomy_tag($product_lc, "origins", $_)} split(/,|$and_or/, $matched_ingredient_ref->{origins})); } push @{$product_ref->{specific_ingredients}}, $specific_ingredients_ref; @@ -4690,6 +4676,7 @@ sub preparse_ingredients_text($product_lc, $text) { init_ingredients_processing_regexps(); init_additives_classes_regexps(); init_allergens_regexps(); + init_origins_regexps(); } my $and = $and{$product_lc} || " and "; diff --git a/lib/ProductOpener/Packaging.pm b/lib/ProductOpener/Packaging.pm index 174cada7f2cd6..8c690379c5e88 100644 --- a/lib/ProductOpener/Packaging.pm +++ b/lib/ProductOpener/Packaging.pm @@ -106,38 +106,15 @@ sub init_packaging_taxonomies_regexps() { foreach my $taxonomy (values %packaging_taxonomies) { - $packaging_taxonomies_regexps{$taxonomy} = {}; # keys: languages - - foreach my $tagid (get_all_taxonomy_entries($taxonomy)) { - - foreach my $language (keys %{$translations_to{$taxonomy}{$tagid}}) { - - defined $packaging_taxonomies_regexps{$taxonomy}{$language} or $packaging_taxonomies_regexps{$taxonomy}{$language} = []; - - foreach my $synonym (get_taxonomy_tag_synonyms($language, $taxonomy, $tagid)) { - - push @{$packaging_taxonomies_regexps{$taxonomy}{$language}}, [$tagid, $synonym]; - - if ((my $unaccented_synonym = unac_string_perl($synonym)) ne $synonym) { - - push @{$packaging_taxonomies_regexps{$taxonomy}{$language}}, [$tagid, $unaccented_synonym]; - } - } + $packaging_taxonomies_regexps{$taxonomy} = + generate_regexps_matching_taxonomy_entries($taxonomy, 
"list_of_regexps", + { } - } - - # We want to match the longest strings first - - foreach my $language (keys %{$packaging_taxonomies_regexps{$taxonomy}}) { - @{$packaging_taxonomies_regexps{$taxonomy}{$language}} - = sort { length($b->[1]) <=> length($a->[1]) } @{$packaging_taxonomies_regexps{$taxonomy}{$language}}; - } + ); $log->debug("init_packaging_taxonomies_regexps - result", { taxonomy => $taxonomy, packaging_taxonomies_regexps => $packaging_taxonomies_regexps{$taxonomy} }) if $log->is_debug(); } - # used only for debugging - #store("packaging_taxonomies_regexps.sto", \%packaging_taxonomies_regexps); return; } diff --git a/lib/ProductOpener/Tags.pm b/lib/ProductOpener/Tags.pm index d047572322466..d24288787c81b 100644 --- a/lib/ProductOpener/Tags.pm +++ b/lib/ProductOpener/Tags.pm @@ -152,6 +152,8 @@ BEGIN &get_all_taxonomy_entries &get_taxonomy_tag_synonyms + &generate_regexps_matching_taxonomy_entries + ); # symbols to export on request %EXPORT_TAGS = (all => [@EXPORT_OK]); } @@ -4224,7 +4226,102 @@ sub add_users_translations_to_taxonomy($tagtype) { return; } +=head2 generate_regexps_matching_taxonomy_entries($taxonomy, $return_type, $options_ref) + +Create regular expressions that will match entries of a taxonomy. + +=head3 Arguments + +=head4 $taxonomy + +The type of the tag (e.g. categories, labels, allergens) + +=head4 $return_type - string + +Either "unique_regexp" to get one single regexp for all entries of one language. + +Or "list_of_regexps" to get a list of regexps (1 per entry) for each language. +For each entry, we return an array with the entry id, and the the regexp for that entry. +e.g. 
['en:coffee',"coffee|coffees"] + +=head4 $options_ref + +A reference to a hash to enable options to indicate how to match: + +- add_simple_plurals : in some languages, like French, we will allow an extra "s" at the end of entries +- add_simple_singulars: same with removing the "s" at the end of entries +- match_space_with_dash: spaces or dashes in entries will match either a space or a dash (e.g. "South America" will match "South-America") + +=cut + +sub generate_regexps_matching_taxonomy_entries($taxonomy, $return_type, $options_ref) { + + # We will return for each language an unique regexp or a list of regexps + my $result_ref = {}; + + # Lists of synonyms regular expressions per language + my %synonyms_regexps = (); + + foreach my $tagid ( get_all_taxonomy_entries($taxonomy) ) { + + foreach my $language ( keys %{ $translations_to{$taxonomy}{$tagid} } ) { + + defined $synonyms_regexps{$language} or $synonyms_regexps{$language} = []; + + # the synonyms below also contain the main translation as the first entry + + foreach my $synonym ( get_taxonomy_tag_synonyms($language, $taxonomy, $tagid) ) { + if ($options_ref->{add_simple_singulars}) { + if ($synonym =~ /s$/) { + # match entry without final s + $synonym =~ s/s$/\(\?:s\?\)/; + } + } + + if ($options_ref->{add_simple_plurals}) { + if ($synonym !~ /s$/) { + # match entry with additional final s + $synonym =~ s/$/\(\?:s\?\)/; + } + } + + if ($options_ref->{match_space_with_dash}) { + # Make spaces match dashes and the reverse + $synonym =~ s/( |-)/\(\?: \|-\)/g; + } + + push @{ $synonyms_regexps{$language} }, + [ $tagid, $synonym ]; + + if ( ( my $unaccented_synonym = unac_string_perl($synonym) ) ne $synonym ) { + push @{ $synonyms_regexps{$language} }, + [ $tagid, $unaccented_synonym ]; + } + } + } + } + + # We want to match the longest strings first + + if ($return_type eq 'unique_regexp') { + foreach my $language ( keys %synonyms_regexps ) { + $result_ref->{$language} = join('|', + map { $_->[1] } + sort { 
length $b->[1] <=> length $a->[1] } @{ $synonyms_regexps{$language} } ); + } + } + elsif ($return_type eq 'list_of_regexps') { + foreach my $language ( keys %synonyms_regexps ) { + @{$result_ref->{$language}} = sort { length $b->[1] <=> length $a->[1] } @{ $synonyms_regexps{$language} }; + } + } + else { + die("unknown return type for generate_regexps_matching_taxonomy_entries: $return_type - must be unique_regexp or list_of_regexps"); + } + + return $result_ref; +} $log->info("Tags.pm loaded") if $log->is_info(); diff --git a/package.json b/package.json index 6ae901e90fe72..413c8005433a4 100644 --- a/package.json +++ b/package.json @@ -15,7 +15,7 @@ "test:integration": "mocha --timeout 10000 -r dotenv/config", "prove": "prove -l --jobs 2", "lint": "npm run lint:js && npm run lint:css && npm run lint:scss", - "lint:js": "eslint gulpfile.js html/js/*.js scripts/*.js test/*.js", + "lint:js": "eslint gulpfile.js html/js/*.js scripts/*.js tests/integration/*.js", "lint:css": "stylelint html/css/*.css", "lint:scss": "stylelint scss/*.scss", "perlc": "npm run perlc:startup && npm run perlc:cgi && npm run perlc:scripts", diff --git a/test/query.test.js b/tests/integration/query.test.js similarity index 100% rename from test/query.test.js rename to tests/integration/query.test.js diff --git a/tests/unit/additives_tags.t b/tests/unit/additives_tags.t index 5e29ec34aa022..141e62b9ed0f3 100755 --- a/tests/unit/additives_tags.t +++ b/tests/unit/additives_tags.t @@ -162,6 +162,51 @@ my @tests = ( [{lc => "es", ingredients_text => "lecitina de girasol"}, ["en:e322i"]], + # Mandatory additives classes + + [ + { + lc => 'en', + ingredients_text => "amaranth" + }, + [] + ], + + [ + { + lc => 'en', + ingredients_text => "colour: amaranth" + }, + ['en:e123'] + ], + + # additive class followed by an ingredient, without a : + # this will test the additives entries regexps in Ingredients.pm + [ + { + lc => 'en', + ingredients_text => "flour treatment agent potassium iodate" + }, + 
['en:e917'] + ], + + # plural of additive class followed by an ingredient, without a : + [ + { + lc => 'en', + ingredients_text => "flour treatment agents potassium iodate" + }, + ['en:e917'] + ], + + [ + { + lc => 'en', + ingredients_text => "vitamin A" + }, + [] + ], + ); foreach my $test_ref (@tests) { diff --git a/tests/unit/expected_test_results/ingredients/en-origin-field-with-commas-and.json b/tests/unit/expected_test_results/ingredients/en-origin-field-with-commas-and.json new file mode 100644 index 0000000000000..0b944cc42424b --- /dev/null +++ b/tests/unit/expected_test_results/ingredients/en-origin-field-with-commas-and.json @@ -0,0 +1,84 @@ +{ + "ingredients" : [ + { + "id" : "en:milk", + "origins" : "en:united-kingdom,en:european-union", + "percent_estimate" : 75, + "percent_max" : 100, + "percent_min" : 50, + "text" : "Milk", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "en:sugar", + "origins" : "en:paraguay,en:uruguay,en:costa-rica", + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "sugar", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], + "ingredients_analysis" : { + "en:non-vegan" : [ + "en:milk" + ] + }, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:non-vegan", + "en:vegetarian" + ], + "ingredients_hierarchy" : [ + "en:milk", + "en:dairy", + "en:sugar", + "en:added-sugar", + "en:disaccharide" + ], + "ingredients_n" : 2, + "ingredients_n_tags" : [ + "2", + "1-10" + ], + "ingredients_original_tags" : [ + "en:milk", + "en:sugar" + ], + "ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "en:milk", + "en:dairy", + "en:sugar", + "en:added-sugar", + "en:disaccharide" + ], + "ingredients_text" : "Milk, sugar. Origin of the milk: UK, European Union. 
Origin of sugar: Paraguay, Uruguay and Costa Rica.", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 2, + "ingredients_with_unspecified_percent_sum" : 100, + "known_ingredients_n" : 5, + "lc" : "en", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0 + }, + "specific_ingredients" : [ + { + "id" : "en:milk", + "ingredient" : "milk", + "origins" : "en:united-kingdom,en:european-union", + "text" : "Origin of the milk: UK, European Union." + }, + { + "id" : "en:sugar", + "ingredient" : "sugar", + "origins" : "en:paraguay,en:uruguay,en:costa-rica", + "text" : "Origin of sugar: Paraguay, Uruguay and Costa Rica" + } + ], + "unknown_ingredients_n" : 0 +} diff --git a/tests/unit/expected_test_results/ingredients/en-origin-field-with-commas.json b/tests/unit/expected_test_results/ingredients/en-origin-field-with-commas.json new file mode 100644 index 0000000000000..133ac1c98406b --- /dev/null +++ b/tests/unit/expected_test_results/ingredients/en-origin-field-with-commas.json @@ -0,0 +1,77 @@ +{ + "ingredients" : [ + { + "id" : "en:milk", + "origins" : "en:belgium,en:spain", + "percent_estimate" : 75, + "percent_max" : 100, + "percent_min" : 50, + "text" : "Milk", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "en:sugar", + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "sugar", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], + "ingredients_analysis" : { + "en:non-vegan" : [ + "en:milk" + ] + }, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:non-vegan", + "en:vegetarian" + ], + "ingredients_hierarchy" : [ + "en:milk", + "en:dairy", + "en:sugar", + "en:added-sugar", + "en:disaccharide" + ], + "ingredients_n" : 2, + "ingredients_n_tags" : [ + "2", + "1-10" + ], + "ingredients_original_tags" : [ + "en:milk", + "en:sugar" + ], + 
"ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "en:milk", + "en:dairy", + "en:sugar", + "en:added-sugar", + "en:disaccharide" + ], + "ingredients_text" : "Milk, sugar. Origin of the milk: Belgium, Spain", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 2, + "ingredients_with_unspecified_percent_sum" : 100, + "known_ingredients_n" : 5, + "lc" : "en", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0 + }, + "specific_ingredients" : [ + { + "id" : "en:milk", + "ingredient" : "milk", + "origins" : "en:belgium,en:spain", + "text" : "Origin of the milk: Belgium, Spain" + } + ], + "unknown_ingredients_n" : 0 +} diff --git a/tests/unit/expected_test_results/ingredients/en-origin-ingredient-from-origin.json b/tests/unit/expected_test_results/ingredients/en-origin-ingredient-from-origin.json new file mode 100644 index 0000000000000..b2897440a8aa5 --- /dev/null +++ b/tests/unit/expected_test_results/ingredients/en-origin-ingredient-from-origin.json @@ -0,0 +1,79 @@ +{ + "ingredients" : [ + { + "id" : "en:red-bell-pepper", + "origins" : "en:spain,en:italy,en:france", + "percent_estimate" : 75, + "percent_max" : 100, + "percent_min" : 50, + "text" : "Red peppers", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:yellow-bell-pepper", + "origins" : "en:south-america", + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "yellow peppers", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], + "ingredients_analysis" : {}, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:vegan", + "en:vegetarian" + ], + "ingredients_hierarchy" : [ + "en:red-bell-pepper", + "en:vegetable", + "en:bell-pepper", + "en:yellow-bell-pepper" + ], + "ingredients_n" : 2, + "ingredients_n_tags" : [ + "2", + "1-10" + ], + "ingredients_original_tags" : [ + 
"en:red-bell-pepper", + "en:yellow-bell-pepper" + ], + "ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "en:red-bell-pepper", + "en:vegetable", + "en:bell-pepper", + "en:yellow-bell-pepper" + ], + "ingredients_text" : "Red peppers, yellow peppers", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 2, + "ingredients_with_unspecified_percent_sum" : 100, + "known_ingredients_n" : 4, + "lc" : "en", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 50, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 50 + }, + "origin_en" : "Red peppers from Spain, Italy and France, Yellow peppers from South America", + "specific_ingredients" : [ + { + "id" : "en:red-bell-pepper", + "ingredient" : "Red peppers", + "origins" : "en:spain,en:italy,en:france", + "text" : "Red peppers from Spain, Italy and France," + }, + { + "id" : "en:yellow-bell-pepper", + "ingredient" : "Yellow peppers", + "origins" : "en:south-america", + "text" : "Yellow peppers from South America" + } + ], + "unknown_ingredients_n" : 0 +} diff --git a/tests/unit/expected_test_results/ingredients/en-origin-ingredient-origin-and-origin.json b/tests/unit/expected_test_results/ingredients/en-origin-ingredient-origin-and-origin.json new file mode 100644 index 0000000000000..571ec1820b59e --- /dev/null +++ b/tests/unit/expected_test_results/ingredients/en-origin-ingredient-origin-and-origin.json @@ -0,0 +1,79 @@ +{ + "ingredients" : [ + { + "id" : "en:red-bell-pepper", + "origins" : "en:spain,en:south-america", + "percent_estimate" : 75, + "percent_max" : 100, + "percent_min" : 50, + "text" : "Red peppers", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:yellow-bell-pepper", + "origins" : "en:mexico,en:canada,en:california", + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "yellow peppers", + "vegan" : "yes", + "vegetarian" : "yes" + 
} + ], + "ingredients_analysis" : {}, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:vegan", + "en:vegetarian" + ], + "ingredients_hierarchy" : [ + "en:red-bell-pepper", + "en:vegetable", + "en:bell-pepper", + "en:yellow-bell-pepper" + ], + "ingredients_n" : 2, + "ingredients_n_tags" : [ + "2", + "1-10" + ], + "ingredients_original_tags" : [ + "en:red-bell-pepper", + "en:yellow-bell-pepper" + ], + "ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "en:red-bell-pepper", + "en:vegetable", + "en:bell-pepper", + "en:yellow-bell-pepper" + ], + "ingredients_text" : "Red peppers, yellow peppers", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 2, + "ingredients_with_unspecified_percent_sum" : 100, + "known_ingredients_n" : 4, + "lc" : "en", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 50, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 50 + }, + "origin_en" : "Red peppers: Spain or South America, Yellow peppers: Mexico, Canada and California", + "specific_ingredients" : [ + { + "id" : "en:red-bell-pepper", + "ingredient" : "Red peppers", + "origins" : "en:spain,en:south-america", + "text" : "Red peppers: Spain or South America," + }, + { + "id" : "en:yellow-bell-pepper", + "ingredient" : "Yellow peppers", + "origins" : "en:mexico,en:canada,en:california", + "text" : "Yellow peppers: Mexico, Canada and California" + } + ], + "unknown_ingredients_n" : 0 +} diff --git a/tests/unit/expected_test_results/ingredients/en-vitamin.json b/tests/unit/expected_test_results/ingredients/en-vitamin.json new file mode 100644 index 0000000000000..acb46e41ad0bc --- /dev/null +++ b/tests/unit/expected_test_results/ingredients/en-vitamin.json @@ -0,0 +1,60 @@ +{ + "ingredients" : [ + { + "id" : "en:vitamin-a", + "percent_estimate" : 75, + "percent_max" : 100, + "percent_min" : 50, + "text" : "vitamin a", + "vegan" : "yes", + 
"vegetarian" : "yes" + }, + { + "id" : "en:salt", + "percent_estimate" : 25, + "percent_max" : 50, + "percent_min" : 0, + "text" : "salt", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], + "ingredients_analysis" : {}, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:vegan", + "en:vegetarian" + ], + "ingredients_hierarchy" : [ + "en:vitamin-a", + "en:vitamins", + "en:salt" + ], + "ingredients_n" : 2, + "ingredients_n_tags" : [ + "2", + "1-10" + ], + "ingredients_original_tags" : [ + "en:vitamin-a", + "en:salt" + ], + "ingredients_percent_analysis" : 1, + "ingredients_tags" : [ + "en:vitamin-a", + "en:vitamins", + "en:salt" + ], + "ingredients_text" : "vitamin a, salt", + "ingredients_with_specified_percent_n" : 0, + "ingredients_with_specified_percent_sum" : 0, + "ingredients_with_unspecified_percent_n" : 2, + "ingredients_with_unspecified_percent_sum" : 100, + "known_ingredients_n" : 3, + "lc" : "en", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0 + }, + "unknown_ingredients_n" : 0 +} diff --git a/tests/unit/expected_test_results/ingredients/fr-origin-field.json b/tests/unit/expected_test_results/ingredients/fr-origin-field.json index d2154ef0532ab..188026d85f254 100644 --- a/tests/unit/expected_test_results/ingredients/fr-origin-field.json +++ b/tests/unit/expected_test_results/ingredients/fr-origin-field.json @@ -223,6 +223,12 @@ "origins" : "en:italy", "text" : "Origine du parmesan : Italie." }, + { + "id" : "en:rice", + "ingredient" : "riz", + "origins" : "en:india,en:thailand", + "text" : "Origine du riz : Inde, Thaïlande." + }, { "id" : "en:white-wine", "ingredient" : "Vin blanc", @@ -235,6 +241,12 @@ "origins" : "en:france", "text" : "Origine Crème UHT : France." }, + { + "id" : "en:tomato", + "ingredient" : "Tomates", + "origins" : "en:italy", + "text" : "Tomates d'Italie." 
+ }, { "id" : "en:pepper", "ingredient" : "poivre", diff --git a/tests/unit/expected_test_results/ingredients/fr-origin-ingredient-origin-and-origin.json b/tests/unit/expected_test_results/ingredients/fr-origin-ingredient-origin-and-origin.json new file mode 100644 index 0000000000000..ca316dcd949f7 --- /dev/null +++ b/tests/unit/expected_test_results/ingredients/fr-origin-ingredient-origin-and-origin.json @@ -0,0 +1,310 @@ +{ + "ingredients" : [ + { + "id" : "en:potato", + "origins" : "en:france", + "percent" : 47, + "percent_estimate" : 47, + "text" : "Pomme de Terre", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:pork", + "origins" : "en:france", + "percent" : 22, + "percent_estimate" : 22, + "text" : "Porc", + "vegan" : "no", + "vegetarian" : "no" + }, + { + "id" : "en:semi-skimmed-milk", + "origins" : "en:france", + "percent" : 5.5, + "percent_estimate" : 5.5, + "text" : "Lait demi-écrémé", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "en:single-cream", + "origins" : "en:france", + "percent" : 5.5, + "percent_estimate" : 5.5, + "text" : "Crème liquide", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "en:water", + "origins" : "en:france", + "percent" : 5.5, + "percent_estimate" : 5.5, + "text" : "Eau", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:butter", + "origins" : "en:france", + "percent" : 2.7, + "percent_estimate" : 2.7, + "text" : "Beurre", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "fr:moutarde-a-l-ancienne", + "ingredients" : [], + "origins" : "en:france", + "percent" : 2.7, + "percent_estimate" : 2.7, + "text" : "Moutarde à l'ancienne", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:cream", + "origins" : "en:france", + "percent" : 2.7, + "percent_estimate" : 2.7, + "text" : "Crème", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "en:dijon-mustard", + "ingredients" : [], + "origins" : "en:france", + "percent" : 2.7, + "percent_estimate" : 2.7, + 
"text" : "Moutarde de Dijon", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:flower-honey", + "origins" : "en:france", + "percent" : 2.7, + "percent_estimate" : 2.7, + "text" : "Miel de fleurs", + "vegan" : "no", + "vegetarian" : "yes" + }, + { + "id" : "en:spice", + "origins" : "en:india", + "percent" : 0.55, + "percent_estimate" : 0.55, + "text" : "Epices", + "vegan" : "yes", + "vegetarian" : "yes" + }, + { + "id" : "en:broth", + "ingredients" : [], + "origins" : "en:france", + "percent" : 0.55, + "percent_estimate" : 0.449999999999989, + "text" : "bouillon" + }, + { + "id" : "en:finegrained-salt", + "origins" : "en:france", + "percent" : 0.14, + "percent_estimate" : 0, + "text" : "Sel fin", + "vegan" : "yes", + "vegetarian" : "yes" + } + ], + "ingredients_analysis" : { + "en:non-vegan" : [ + "en:pork", + "en:semi-skimmed-milk", + "en:single-cream", + "en:butter", + "en:cream", + "en:flower-honey" + ], + "en:non-vegetarian" : [ + "en:pork" + ], + "en:vegan-status-unknown" : [ + "en:broth" + ], + "en:vegetarian-status-unknown" : [ + "en:broth" + ] + }, + "ingredients_analysis_tags" : [ + "en:palm-oil-free", + "en:non-vegan", + "en:non-vegetarian" + ], + "ingredients_hierarchy" : [ + "en:potato", + "en:vegetable", + "en:root-vegetable", + "en:pork", + "en:animal", + "en:semi-skimmed-milk", + "en:dairy", + "en:milk", + "en:single-cream", + "en:cream", + "en:water", + "en:butter", + "fr:moutarde-a-l-ancienne", + "en:condiment", + "en:spice", + "en:mustard", + "en:dijon-mustard", + "en:flower-honey", + "en:added-sugar", + "en:honey", + "en:broth", + "en:finegrained-salt", + "en:salt" + ], + "ingredients_n" : 13, + "ingredients_n_tags" : [ + "13", + "11-20" + ], + "ingredients_original_tags" : [ + "en:potato", + "en:pork", + "en:semi-skimmed-milk", + "en:single-cream", + "en:water", + "en:butter", + "fr:moutarde-a-l-ancienne", + "en:cream", + "en:dijon-mustard", + "en:flower-honey", + "en:spice", + "en:broth", + "en:finegrained-salt" + ], + 
"ingredients_percent_analysis" : -1, + "ingredients_tags" : [ + "en:potato", + "en:vegetable", + "en:root-vegetable", + "en:pork", + "en:animal", + "en:semi-skimmed-milk", + "en:dairy", + "en:milk", + "en:single-cream", + "en:cream", + "en:water", + "en:butter", + "fr:moutarde-a-l-ancienne", + "en:condiment", + "en:spice", + "en:mustard", + "en:dijon-mustard", + "en:flower-honey", + "en:added-sugar", + "en:honey", + "en:broth", + "en:finegrained-salt", + "en:salt" + ], + "ingredients_text" : "Pomme de Terre 47%, Porc 22%, Lait demi-écrémé (contient Lait) 5.5%, Crème liquide (contient Lait) 5.5%, Eau 5.5%,\n\t\t\tBeurre (contient Lait) 2.7%, Moutarde à l'ancienne (contient Moutarde, Sulfites) 2.7%, Crème (contient Lait) 2.7%, Moutarde de Dijon (contient Moutarde, Sulfites) 2.7%,\n\t\t\tMiel de fleurs 2.7%, Epices (contient Sésame) 0.55%, bouillon (contient Gluten, Lait, Céleri) 0.55%, Sel fin 0.14%", + "ingredients_with_specified_percent_n" : 10, + "ingredients_with_specified_percent_sum" : 94.29, + "ingredients_with_unspecified_percent_n" : 0, + "ingredients_with_unspecified_percent_sum" : 0, + "known_ingredients_n" : 23, + "lc" : "fr", + "nutriments" : { + "fruits-vegetables-nuts-estimate-from-ingredients_100g" : 0, + "fruits-vegetables-nuts-estimate-from-ingredients_serving" : 0 + }, + "origin_fr" : "Pomme de Terre de France, Porc de France, Lait demi-écrémé de France, Crème liquide de France, Eau de France, Beurre de France, \n\t\t\t\tMoutarde à l'ancienne de France, Crème de France, Moutarde de Dijon de France, Miel de fleurs de France, Epices : Inde, Bouillon de France, Sel fin de France", + "specific_ingredients" : [ + { + "id" : "en:potato", + "ingredient" : "Pomme de Terre", + "origins" : "en:france", + "text" : "Pomme de Terre de France," + }, + { + "id" : "en:pork", + "ingredient" : "Porc", + "origins" : "en:france", + "text" : "Porc de France," + }, + { + "id" : "en:semi-skimmed-milk", + "ingredient" : "Lait demi-écrémé", + "origins" : "en:france", + 
"text" : "Lait demi-écrémé de France," + }, + { + "id" : "en:single-cream", + "ingredient" : "Crème liquide", + "origins" : "en:france", + "text" : "Crème liquide de France," + }, + { + "id" : "en:water", + "ingredient" : "Eau", + "origins" : "en:france", + "text" : "Eau de France," + }, + { + "id" : "en:butter", + "ingredient" : "Beurre", + "origins" : "en:france", + "text" : "Beurre de France," + }, + { + "id" : "fr:moutarde-a-l-ancienne", + "ingredient" : "Moutarde à l'ancienne", + "origins" : "en:france", + "text" : "Moutarde à l'ancienne de France," + }, + { + "id" : "en:cream", + "ingredient" : "Crème", + "origins" : "en:france", + "text" : "Crème de France," + }, + { + "id" : "en:dijon-mustard", + "ingredient" : "Moutarde de Dijon", + "origins" : "en:france", + "text" : "Moutarde de Dijon de France," + }, + { + "id" : "en:flower-honey", + "ingredient" : "Miel de fleurs", + "origins" : "en:france", + "text" : "Miel de fleurs de France," + }, + { + "id" : "en:spice", + "ingredient" : "Epices", + "origins" : "en:india", + "text" : "Epices : Inde," + }, + { + "id" : "en:broth", + "ingredient" : "Bouillon", + "origins" : "en:france", + "text" : "Bouillon de France," + }, + { + "id" : "en:finegrained-salt", + "ingredient" : "Sel fin", + "origins" : "en:france", + "text" : "Sel fin de France" + } + ], + "unknown_ingredients_n" : 0 +} diff --git a/tests/unit/ingredients.t b/tests/unit/ingredients.t index bb1b1c5e865ee..c687c31f1b132 100755 --- a/tests/unit/ingredients.t +++ b/tests/unit/ingredients.t @@ -14,29 +14,33 @@ use ProductOpener::Config qw/:all/; use ProductOpener::Tags qw/:all/; use ProductOpener::TagsEntries qw/:all/; use ProductOpener::Ingredients qw/:all/; +use ProductOpener::Test qw/:all/; -my $expected_dir = dirname(__FILE__) . "/expected_test_results"; -my $testdir = "ingredients"; + +my $test_name = "ingredients"; +my $tests_dir = dirname(__FILE__); +my $expected_dir = $tests_dir . "/expected_test_results/" . 
$test_name; my $usage = < \$resultsdir) +GetOptions ("update-expected-results" => \$update_expected_results) or die("Error in command line arguments.\n\n" . $usage); -if ((defined $resultsdir) and (! -e $resultsdir)) { - mkdir($resultsdir, 0755) or die("Could not create $resultsdir directory: $!\n"); + +if ((defined $update_expected_results) and (! -e $expected_dir)) { + mkdir($expected_dir, 0755) or die("Could not create $expected_dir directory: $!\n"); } my @tests = ( @@ -453,6 +457,65 @@ Teneur en citron de 5,5%", ingredients_text => "Peaches. Some unknown ingredient, another unknown ingredient. Origin of peaches: Spain. Origin of some unknown ingredient: France. origin of Another Unknown Ingredient: Malta", } + ], + + # Origins with commas + [ + "en-origin-field-with-commas", + { + lc => "en", + ingredients_text => "Milk, sugar. Origin of the milk: Belgium, Spain", + } + ], + + # Origins with commas + [ + "en-origin-field-with-commas-and", + { + lc => "en", + ingredients_text => "Milk, sugar. Origin of the milk: UK, European Union. 
Origin of sugar: Paraguay, Uruguay and Costa Rica.", + } + ], + + # Origins : X from Y + [ + "en-origin-ingredient-from-origin", + { + lc => "en", + ingredients_text => "Red peppers, yellow peppers", + origin_en => "Red peppers from Spain, Italy and France, Yellow peppers from South America", + } + ], + + # Origins : X: Y or Z, X: Y, Z and W + [ + "en-origin-ingredient-origin-and-origin", + { + lc => "en", + ingredients_text => "Red peppers, yellow peppers", + origin_en => "Red peppers: Spain or South America, Yellow peppers: Mexico, Canada and California", + } + ], + + # Origins : French - X from Y + [ + "fr-origin-ingredient-origin-and-origin", + { + lc => "fr", + ingredients_text => "Pomme de Terre 47%, Porc 22%, Lait demi-écrémé (contient Lait) 5.5%, Crème liquide (contient Lait) 5.5%, Eau 5.5%, + Beurre (contient Lait) 2.7%, Moutarde à l'ancienne (contient Moutarde, Sulfites) 2.7%, Crème (contient Lait) 2.7%, Moutarde de Dijon (contient Moutarde, Sulfites) 2.7%, + Miel de fleurs 2.7%, Epices (contient Sésame) 0.55%, bouillon (contient Gluten, Lait, Céleri) 0.55%, Sel fin 0.14%", + origin_fr => "Pomme de Terre de France, Porc de France, Lait demi-écrémé de France, Crème liquide de France, Eau de France, Beurre de France, + Moutarde à l'ancienne de France, Crème de France, Moutarde de Dijon de France, Miel de fleurs de France, Epices : Inde, Bouillon de France, Sel fin de France", + } + ], + + [ + "en-vitamin", + { + lc => "en", + ingredients_text => "vitamin a, salt", + } ], ); @@ -473,26 +536,7 @@ foreach my $test_ref (@tests) { extract_ingredients_from_text($product_ref); - # Save the result - - if (defined $resultsdir) { - open (my $result, ">:encoding(UTF-8)", "$resultsdir/$testid.json") or die("Could not create $resultsdir/$testid.json: $!\n"); - print $result $json->pretty->encode($product_ref); - close ($result); - } - - # Compare the result with the expected result - - if (open (my $expected_result, "<:encoding(UTF-8)", "$expected_dir/$testdir/$testid.json")) { - - 
local $/; #Enable 'slurp' mode - my $expected_product_ref = $json->decode(<$expected_result>); - is_deeply ($product_ref, $expected_product_ref) or diag explain $product_ref; - } - else { - diag explain $product_ref; - fail("could not load expected_test_results/$testdir/$testid.json"); - } + compare_to_expected_results($product_ref, "$expected_dir/$testid.json", $update_expected_results); } diff --git a/t/tags_unit.t b/tests/unit/tags_unit.t similarity index 100% rename from t/tags_unit.t rename to tests/unit/tags_unit.t diff --git a/tests/update_tests_results.sh b/tests/update_tests_results.sh index b5fc8b4691a1b..6fc8b094f5c74 100755 --- a/tests/update_tests_results.sh +++ b/tests/update_tests_results.sh @@ -28,5 +28,5 @@ cd .. # Integration tests perl integration/import_csv_file.t --update-expected-results -perl export.t --update-expected-results +perl integration/export.t --update-expected-results