Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: parsing of dots in packaging / recycling instructions #7948

Merged
merged 3 commits into from
Jan 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion lib/ProductOpener/Packaging.pm
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,9 @@ sub parse_packaging_component_data_from_text_phrase ($text, $text_language) {
$text = $';
}

# We might have escaped dots and commas inside numbers from analyze_and_combine_packaging_data()
$text =~ s/(\d)\\(\.|\,)(\d)/$1$2$3/g;

# Also try to match the canonicalized form so that we can match the extended synonyms that are only available in canonicalized form
my $textid = get_string_id_for_lang($text_language, $text);

Expand Down Expand Up @@ -779,7 +782,13 @@ sub analyze_and_combine_packaging_data ($product_ref, $response_ref) {
# Packaging text field (populated by OCR of the packaging image and/or contributors or producers)
if (defined $product_ref->{packaging_text}) {

my @packaging_text_entries = split(/,|;|\n/, $product_ref->{packaging_text});
# Separate phrases by matching:
# . , ; and newlines
# but we want to keep commas and dots that are inside numbers (3.40 or 1,5)
# so we escape them first
my $packaging_text = $product_ref->{packaging_text};
$packaging_text =~ s/(\d)(\.|,)(\d)/$1\\$2$3/g;
my @packaging_text_entries = split(/(?<!\\)\.|(?<!\\),|;|\n/, $packaging_text);
push(@phrases, @packaging_text_entries);
$number_of_packaging_text_entries = scalar @packaging_text_entries;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"count" : 5,
"page" : "1",
"page" : 1,
"page_count" : 5,
"page_size" : 24,
"products" : [
Expand Down Expand Up @@ -61,7 +61,7 @@
"2xxxxxxxxxxx"
],
"complete" : 0,
"completeness" : "0.5",
"completeness" : 0.5,
"correctors_tags" : [],
"created_t" : "--ignore--",
"creator" : "tests",
Expand Down Expand Up @@ -320,21 +320,21 @@
},
{
"ecoscore_material_score" : 76,
"ecoscore_shape_ratio" : "0.2",
"ecoscore_shape_ratio" : 0.2,
"material" : "en:steel",
"number_of_units" : 1,
"shape" : "en:lid"
},
{
"ecoscore_material_score" : 0,
"ecoscore_shape_ratio" : "0.1",
"ecoscore_shape_ratio" : 0.1,
"material" : "en:plastic",
"non_recyclable_and_non_biodegradable" : "maybe",
"number_of_units" : 1,
"shape" : "en:film"
}
],
"score" : "66.2",
"score" : 66.2,
"value" : -3
},
"production_system" : {
Expand Down Expand Up @@ -369,23 +369,23 @@
"entry_dates_tags" : "--ignore--",
"food_groups_tags" : [],
"forest_footprint_data" : {
"footprint_per_kg" : "0.0074375",
"footprint_per_kg" : 0.0074375,
"grade" : "a",
"ingredients" : [
{
"conditions_tags" : [],
"footprint_per_kg" : "0.0074375",
"footprint_per_kg" : 0.0074375,
"matching_tag_id" : "en:egg",
"percent" : "9.375",
"percent_estimate" : "9.375",
"percent" : 9.375,
"percent_estimate" : 9.375,
"processing_factor" : 1,
"tag_id" : "en:egg",
"tag_type" : "ingredients",
"type" : {
"deforestation_risk" : "0.68",
"deforestation_risk" : 0.68,
"name" : "Oeufs Importés",
"soy_feed_factor" : "0.035",
"soy_yield" : "0.3"
"soy_feed_factor" : 0.035,
"soy_yield" : 0.3
}
}
]
Expand All @@ -399,7 +399,7 @@
"ingredients" : [
{
"id" : "en:apple",
"percent_estimate" : "62.5",
"percent_estimate" : 62.5,
"percent_max" : 100,
"percent_min" : 25,
"rank" : 1,
Expand All @@ -409,7 +409,7 @@
},
{
"id" : "en:milk",
"percent_estimate" : "18.75",
"percent_estimate" : 18.75,
"percent_max" : 50,
"percent_min" : 0,
"rank" : 2,
Expand All @@ -419,8 +419,8 @@
},
{
"id" : "en:egg",
"percent_estimate" : "9.375",
"percent_max" : "33.3333333333333",
"percent_estimate" : 9.375,
"percent_max" : 33.3333333333333,
"percent_min" : 0,
"rank" : 3,
"text" : "eggs",
Expand All @@ -430,7 +430,7 @@
{
"from_palm_oil" : "yes",
"id" : "en:palm-oil",
"percent_estimate" : "9.375",
"percent_estimate" : 9.375,
"percent_max" : 25,
"percent_min" : 0,
"rank" : 4,
Expand Down Expand Up @@ -710,7 +710,7 @@
"2xxxxxxxxxxx"
],
"complete" : 0,
"completeness" : "0.5",
"completeness" : 0.5,
"correctors_tags" : [],
"created_t" : "--ignore--",
"creator" : "tests",
Expand Down Expand Up @@ -1251,7 +1251,7 @@
"2xxxxxxxxxxx"
],
"complete" : 0,
"completeness" : "0.5",
"completeness" : 0.5,
"correctors_tags" : [],
"created_t" : "--ignore--",
"creator" : "tests",
Expand Down Expand Up @@ -1533,23 +1533,23 @@
"entry_dates_tags" : "--ignore--",
"food_groups_tags" : [],
"forest_footprint_data" : {
"footprint_per_kg" : "0.0132222222222222",
"footprint_per_kg" : 0.0132222222222222,
"grade" : "a",
"ingredients" : [
{
"conditions_tags" : [],
"footprint_per_kg" : "0.0132222222222222",
"footprint_per_kg" : 0.0132222222222222,
"matching_tag_id" : "en:egg",
"percent" : "16.6666666666667",
"percent_estimate" : "16.6666666666667",
"percent" : 16.6666666666667,
"percent_estimate" : 16.6666666666667,
"processing_factor" : 1,
"tag_id" : "en:egg",
"tag_type" : "ingredients",
"type" : {
"deforestation_risk" : "0.68",
"deforestation_risk" : 0.68,
"name" : "Oeufs Importés",
"soy_feed_factor" : "0.035",
"soy_yield" : "0.3"
"soy_feed_factor" : 0.035,
"soy_yield" : 0.3
}
}
]
Expand All @@ -1563,17 +1563,17 @@
"ingredients" : [
{
"id" : "en:apple",
"percent_estimate" : "66.6666666666667",
"percent_estimate" : 66.6666666666667,
"percent_max" : 100,
"percent_min" : "33.3333333333333",
"percent_min" : 33.3333333333333,
"rank" : 1,
"text" : "apple",
"vegan" : "yes",
"vegetarian" : "yes"
},
{
"id" : "en:milk",
"percent_estimate" : "16.6666666666667",
"percent_estimate" : 16.6666666666667,
"percent_max" : 50,
"percent_min" : 0,
"rank" : 2,
Expand All @@ -1583,8 +1583,8 @@
},
{
"id" : "en:egg",
"percent_estimate" : "16.6666666666667",
"percent_max" : "33.3333333333333",
"percent_estimate" : 16.6666666666667,
"percent_max" : 33.3333333333333,
"percent_min" : 0,
"rank" : 3,
"text" : "eggs",
Expand Down Expand Up @@ -1696,8 +1696,8 @@
"nutrient_levels" : {},
"nutrient_levels_tags" : [],
"nutriments" : {
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : "33.3333333333333",
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : "33.3333333333333",
"fruits-vegetables-nuts-estimate-from-ingredients_100g" : 33.3333333333333,
"fruits-vegetables-nuts-estimate-from-ingredients_serving" : 33.3333333333333,
"nova-group" : 1,
"nova-group_100g" : 1,
"nova-group_serving" : 1
Expand Down Expand Up @@ -1827,7 +1827,7 @@
"2xxxxxxxxxxx"
],
"complete" : 0,
"completeness" : "0.5",
"completeness" : 0.5,
"correctors_tags" : [],
"created_t" : "--ignore--",
"creator" : "tests",
Expand Down Expand Up @@ -2121,24 +2121,24 @@
"ingredients" : [
{
"id" : "es:apple",
"percent_estimate" : "66.6666666666667",
"percent_estimate" : 66.6666666666667,
"percent_max" : 100,
"percent_min" : "33.3333333333333",
"percent_min" : 33.3333333333333,
"rank" : 1,
"text" : "apple"
},
{
"id" : "es:water",
"percent_estimate" : "16.6666666666667",
"percent_estimate" : 16.6666666666667,
"percent_max" : 50,
"percent_min" : 0,
"rank" : 2,
"text" : "water"
},
{
"id" : "es:palm-oil",
"percent_estimate" : "16.6666666666667",
"percent_max" : "33.3333333333333",
"percent_estimate" : 16.6666666666667,
"percent_max" : 33.3333333333333,
"percent_min" : 0,
"rank" : 3,
"text" : "palm oil"
Expand Down Expand Up @@ -2396,7 +2396,7 @@
"2xxxxxxxxxxx"
],
"complete" : 0,
"completeness" : "0.5",
"completeness" : 0.5,
"correctors_tags" : [],
"created_t" : "--ignore--",
"creator" : "tests",
Expand Down Expand Up @@ -2660,22 +2660,22 @@
},
"agribalyse" : {
"agribalyse_proxy_food_code" : "32135",
"co2_agriculture" : "2.2396004",
"co2_agriculture" : 2.2396004,
"co2_consumption" : 0,
"co2_distribution" : "0.019530673",
"co2_packaging" : "0.28159592",
"co2_processing" : "0.77051126",
"co2_total" : "3.564111983",
"co2_transportation" : "0.25287373",
"co2_distribution" : 0.019530673,
"co2_packaging" : 0.28159592,
"co2_processing" : 0.77051126,
"co2_total" : 3.564111983,
"co2_transportation" : 0.25287373,
"code" : "32135",
"dqr" : "4.03",
"ef_agriculture" : "0.50064279",
"ef_agriculture" : 0.50064279,
"ef_consumption" : 0,
"ef_distribution" : "0.0048315303",
"ef_packaging" : "0.023715692",
"ef_processing" : "0.078716079",
"ef_total" : "0.6323498093",
"ef_transportation" : "0.024443718",
"ef_distribution" : 0.0048315303,
"ef_packaging" : 0.023715692,
"ef_processing" : 0.078716079,
"ef_total" : 0.6323498093,
"ef_transportation" : 0.024443718,
"is_beverage" : 0,
"name_en" : "Breakfast cereals, mix of puffed or extruded cereals, fortified with vitamins and chemical elements",
"name_fr" : "Multi-céréales soufflées ou extrudées, enrichies en vitamines et minéraux",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"count" : 1,
"page" : "1",
"page" : 1,
"page_count" : 1,
"page_size" : 24,
"products" : [
Expand Down Expand Up @@ -74,7 +74,7 @@
"2xxxxxxxxxxx"
],
"complete" : 0,
"completeness" : "0.5",
"completeness" : 0.5,
"correctors_tags" : [],
"created_t" : "--ignore--",
"creator" : "tests",
Expand Down Expand Up @@ -338,22 +338,22 @@
},
"agribalyse" : {
"agribalyse_proxy_food_code" : "32135",
"co2_agriculture" : "2.2396004",
"co2_agriculture" : 2.2396004,
"co2_consumption" : 0,
"co2_distribution" : "0.019530673",
"co2_packaging" : "0.28159592",
"co2_processing" : "0.77051126",
"co2_total" : "3.564111983",
"co2_transportation" : "0.25287373",
"co2_distribution" : 0.019530673,
"co2_packaging" : 0.28159592,
"co2_processing" : 0.77051126,
"co2_total" : 3.564111983,
"co2_transportation" : 0.25287373,
"code" : "32135",
"dqr" : "4.03",
"ef_agriculture" : "0.50064279",
"ef_agriculture" : 0.50064279,
"ef_consumption" : 0,
"ef_distribution" : "0.0048315303",
"ef_packaging" : "0.023715692",
"ef_processing" : "0.078716079",
"ef_total" : "0.6323498093",
"ef_transportation" : "0.024443718",
"ef_distribution" : 0.0048315303,
"ef_packaging" : 0.023715692,
"ef_processing" : 0.078716079,
"ef_total" : 0.6323498093,
"ef_transportation" : 0.024443718,
"is_beverage" : 0,
"name_en" : "Breakfast cereals, mix of puffed or extruded cereals, fortified with vitamins and chemical elements",
"name_fr" : "Multi-céréales soufflées ou extrudées, enrichies en vitamines et minéraux",
Expand Down
Loading