diff --git a/lib/ProductOpener/Packaging.pm b/lib/ProductOpener/Packaging.pm index c040cce4a8abc..be0f954cdd656 100644 --- a/lib/ProductOpener/Packaging.pm +++ b/lib/ProductOpener/Packaging.pm @@ -159,6 +159,9 @@ sub parse_packaging_component_data_from_text_phrase ($text, $text_language) { $text = $'; } + # We might have escaped dots and commas inside numbers from analyze_and_combine_packaging_data() + $text =~ s/(\d)\\(\.|\,)(\d)/$1$2$3/g; + # Also try to match the canonicalized form so that we can match the extended synonyms that are only available in canonicalized form my $textid = get_string_id_for_lang($text_language, $text); @@ -779,7 +782,13 @@ sub analyze_and_combine_packaging_data ($product_ref, $response_ref) { # Packaging text field (populated by OCR of the packaging image and/or contributors or producers) if (defined $product_ref->{packaging_text}) { - my @packaging_text_entries = split(/,|;|\n/, $product_ref->{packaging_text}); + # Separate phrases by matching: + # . , ; and newlines + # but we want to keep commas and dots that are inside numbers (3.40 or 1,5) + # so we escape them first + my $packaging_text = $product_ref->{packaging_text}; + $packaging_text =~ s/(\d)(\.|,)(\d)/$1\\$2$3/g; + my @packaging_text_entries = split(/(? "fr", - packaging_text => "barquette en plastique à jeter;film plastique à jeter; boîte en carton à recycler" + packaging_text => "barquette en plastique à jeter; film plastique à jeter; boîte en carton à recycler" } ], [ @@ -517,6 +517,33 @@ boîte en carton à recycler" } ], + # dots were not parsed correctly + [ + 'fr-dot-to-separate-components', + { + lc => "fr", + packaging_text => "Film plastique à jeter. Étui carton à recycler.", + } + ], + + # comma inside a number: don't split + [ + 'fr-comma-inside-a-number', + { + lc => "fr", + packaging_text => "6 bouteilles en plastique transparent PET de 1,5 L à recycler", + } + ], + + # comma without spaces, not in a number: split + [ + 'fr-comma-without-space', + { + lc => "fr", + packaging_text => "1 boîte en métal,4 bouteilles (plastique).", + } + ], + ); my $json = JSON->new->allow_nonref->canonical;