From f8758ed1c781d1b4a673038728df57167f12418c Mon Sep 17 00:00:00 2001 From: off Date: Mon, 12 Dec 2022 15:46:16 +0100 Subject: [PATCH 1/8] =?UTF-8?q?added=20xx:EG-=C3=96ko-Verordnung,=20made?= =?UTF-8?q?=20de:EG-=C3=96ko-Verordnung=20the=20canonical=20name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- taxonomies/labels.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/taxonomies/labels.txt b/taxonomies/labels.txt index e5485c22d56a5..ef087e90f0218 100644 --- a/taxonomies/labels.txt +++ b/taxonomies/labels.txt @@ -3717,13 +3717,10 @@ country:en:Belgium image:en:biogarantie-bel.90x90.svg Date: Mon, 12 Dec 2022 15:47:09 +0100 Subject: [PATCH 2/8] small changes for Bayard --- lib/ProductOpener/GS1.pm | 1093 ++++++++++++++++---------------------- 1 file changed, 459 insertions(+), 634 deletions(-) diff --git a/lib/ProductOpener/GS1.pm b/lib/ProductOpener/GS1.pm index eba83b03a6f63..4c405ea8fd31e 100644 --- a/lib/ProductOpener/GS1.pm +++ b/lib/ProductOpener/GS1.pm @@ -42,12 +42,14 @@ And the %gs1_maps translate the GS1 specific identifiers (e.g. for allergens or package ProductOpener::GS1; use ProductOpener::PerlStandards; -use Exporter qw< import >; +use Exporter qw< import >; use Log::Any qw($log); -BEGIN { - use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); + +BEGIN +{ + use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); @EXPORT_OK = qw( %gs1_maps @@ -63,7 +65,7 @@ BEGIN { %EXPORT_TAGS = (all => [@EXPORT_OK]); } -use vars @EXPORT_OK; +use vars @EXPORT_OK ; use ProductOpener::Config qw/:all/; use ProductOpener::Tags qw/:all/; @@ -73,6 +75,7 @@ use JSON::PP; use boolean; use Data::DeepAccess qw(deep_get); + =head1 GS1 MAPS GS1 uses many different codes for allergens, packaging etc. @@ -90,6 +93,7 @@ Maps from GS1 to OFF =cut + my %unknown_entries_in_gs1_maps = (); # see https://www.gs1.fr/content/download/2265/17736/version/3/file/FicheProduit3.1.9_PROFIL_ParfumerieSelective_20190523.xlsx @@ -133,7 +137,7 @@ my %unknown_entries_in_gs1_maps = (); # "UN" => "Shellfish", "UW" => "Wheat", }, - + measurementUnitCode => { "GRM" => "g", "KGM" => "kg", @@ -146,7 +150,7 @@ my %unknown_entries_in_gs1_maps = (); "KJO" => "kJ", "H87" => "pièces", }, - + # reference: GS1 T4073 Nutrient type code # https://gs1.se/en/guides/documentation/test-code-lists/t4073-nutrient-type-code/ nutrientTypeCode => { @@ -154,7 +158,7 @@ my %unknown_entries_in_gs1_maps = (); "CA" => "calcium", "CASN" => "casein", "CHOAVL" => "carbohydrates", - "CHOCAL" => "vitamin-d", # cholecalciferol + "CHOCAL" => "vitamin-d", # cholecalciferol "CHOLN" => "choline", "CLD" => "chloride", "CR" => "chromium", @@ -205,8 +209,8 @@ my %unknown_entries_in_gs1_maps = (); "NUCLEOTIDE" => "nucleotide", "P" => "phosphorus", "PANTAC" => "pantothenic-acid", - "POLYL" => "polyols", - "POLYLS" => "polyols", + "POLYL" => "polyols", + "POLYLS" => "polyols", "PRO-" => "proteins", "RIBF" => "vitamin-b2", "SALTEQ" => "salt", @@ -230,7 +234,7 @@ my %unknown_entries_in_gs1_maps = (); # skipped X_ entries such as X_ACAI_BERRY_EXTRACT "ZN" => "zinc", }, - + packagingTypeCode => { "AE" => "Aérosol", "BA" => "Tonneau", @@ -250,8 +254,8 @@ my %unknown_entries_in_gs1_maps = (); "PUG" => "Sac de transport", "TU" => "Tube", "WRP" => "Film", - }, - + }, + packagingTypeCode_unused_not_taxonomized_yet => { "AE" => "en:aerosol", "BA" => "en:barrel", @@ -270,8 +274,8 @@ my %unknown_entries_in_gs1_maps = (); "PUG" => "en:carrying-bag", "TU" => "en:tube", "WRP" => "en:film", - }, - + }, + # http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:PackagingMarkedLabelAccreditationCode packagingMarkedLabelAccreditationCode => { "ADCCPA" => "fr:produit-certifie", @@ -284,6 +288,7 @@ my %unknown_entries_in_gs1_maps = (); "APPELLATION_ORIGINE_CONTROLEE" => "fr:aoc", "AQUACULTURE_STEWARDSHIP_COUNCIL" => "en:responsible-aquaculture-asc", "BLEU_BLANC_COEUR" => "fr:bleu-blanc-coeur", + "BIO_LABEL_GERMAN" => "de:EG-Öko-Verordnung", "BIO_PARTENAIRE" => "fr:biopartenaire", "CROSSED_GRAIN_SYMBOL" => "en:crossed-grain-symbol", "DEMETER" => "en:demeter", @@ -298,7 +303,7 @@ my %unknown_entries_in_gs1_maps = (); "FAIR_TRADE_MARK" => "en:fairtrade-international", "FAIRTRADE_COCOA" => "en:fair-trade", "FAIR_TRADE_USA" => "en:fairtrade-usa", - "FOREST_STEWARDSHIP_COUNCIL_LABEL" => "en:fsc", + "FOREST_STEWARDSHIP_COUNCIL_LABEL" => "en:fsc", "FOREST_STEWARDSHIP_COUNCIL_MIX" => "en:fsc-mix", "FOREST_STEWARDSHIP_COUNCIL_RECYCLED" => "en:fsc-recycled", "GREEN_DOT" => "en:green-dot", @@ -326,14 +331,16 @@ my %unknown_entries_in_gs1_maps = (); "VOLAILLE_FRANCAISE" => "en:french-poultry", }, + # https://gs1.se/en/guides/documentation/code-lists/t3783-target-market-country-code/ targetMarketCountryCode => { "250" => "en:france", + "276" => "en:germany", }, - + timeMeasurementUnitCode => { "MON" => "month", "DAY" => "day", - }, + }, ); # Normalize some entries @@ -345,9 +352,8 @@ foreach my $tag (sort keys %{$gs1_maps{allergenTypeCode}}) { } else { $log->error("gs1_maps - entry not in taxonomy", - {tagtype => "allergens", tag => $gs1_maps{allergenTypeCode}{$tag}}) - if $log->is_error(); - die; + { tagtype => "allergens", tag => $gs1_maps{allergenTypeCode}{$tag} }) if $log->is_error(); + die; } } @@ -358,12 +364,12 @@ foreach my $tag (sort keys %{$gs1_maps{packagingMarkedLabelAccreditationCode}}) } else { $log->error("gs1_maps - entry not in taxonomy", - {tagtype => "labels", tag => $gs1_maps{packagingMarkedLabelAccreditationCode}{$tag}}) - if $log->is_error(); - die; + { tagtype => "labels", tag => $gs1_maps{packagingMarkedLabelAccreditationCode}{$tag} }) if $log->is_error(); + die; } } + =head2 %gs1_message_to_off Defines the structure of the GS1 message data and how to extract the fields useful to create a message confirmation. @@ -374,62 +380,40 @@ my %gs1_message_to_off = ( fields => [ - [ - "catalogue_item_notification:catalogueItemNotificationMessage", - { + ["catalogue_item_notification:catalogueItemNotificationMessage", { fields => [ - [ - "sh:StandardBusinessDocumentHeader", - { + ["sh:StandardBusinessDocumentHeader", { fields => [ ], } ], - [ - "transaction", - { + ["transaction", { fields => [ - [ - "transactionIdentification", - { + ["transactionIdentification", { fields => [ ["entityIdentification", "transactionIdentification_entityIdentification"], - [ - "contentOwner", - { - fields => [["gln", "transactionIdentification_contentOwner_gln"],], + ["contentOwner", { + fields => [ + ["gln", "transactionIdentification_contentOwner_gln"], + ], } ], ], }, ], - [ - "documentCommand", - { + ["documentCommand", { fields => [ - [ - "documentCommandHeader", - { + ["documentCommandHeader", { fields => [ - [ - "documentCommandIdentification", - { + ["documentCommandIdentification", { fields => [ - [ - "entityIdentification", - "documentCommandIdentification_entityIdentification" - ], - [ - "contentOwner", - { + ["entityIdentification", "documentCommandIdentification_entityIdentification"], + ["contentOwner", { fields => [ - [ - "gln", - "documentCommandIdentification_contentOwner_gln" - ], + ["gln", "documentCommandIdentification_contentOwner_gln"], ], } ], @@ -441,56 +425,30 @@ my %gs1_message_to_off = ( }, ], - [ - "catalogue_item_notification:catalogueItemNotification", - { + ["catalogue_item_notification:catalogueItemNotification", { fields => [ - [ - "creationDateTime", - "catalogueItemNotification_creationDateTime" - ], - [ - "documentStatusCode", - "catalogueItemNotification_documentStatusCode" - ], - [ - "catalogueItemNotificationIdentification", - { + ["creationDateTime", "catalogueItemNotification_creationDateTime"], + ["documentStatusCode", "catalogueItemNotification_documentStatusCode"], + ["catalogueItemNotificationIdentification", { fields => [ - [ - "entityIdentification", - "catalogueItemNotificationIdentification_entityIdentification" - ], - [ - "contentOwner", - { + ["entityIdentification", "catalogueItemNotificationIdentification_entityIdentification"], + ["contentOwner", { fields => [ - [ - "gln", - "catalogueItemNotificationIdentification_contentOwner_gln" - ], + ["gln", "catalogueItemNotificationIdentification_contentOwner_gln"], ], } ], ], }, ], - [ - "catalogueItem", - { + ["catalogueItem", { fields => [ - [ - "tradeItem", - { + ["tradeItem", { fields => [ ["gtin", "gtin"], - [ - "targetMarket", - { + ["targetMarket", { fields => [ - ["targetMarketCountryCode", - "targetMarketCountryCode" - ], + ["targetMarketCountryCode", "targetMarketCountryCode"], ], }, ], @@ -515,6 +473,7 @@ my %gs1_message_to_off = ( ], ); + =head2 %gs1_product_to_off Defines the structure of the GS1 product data and how it maps to the OFF data. @@ -523,17 +482,17 @@ Defines the structure of the GS1 product data and how it maps to the OFF data. my %gs1_product_to_off = ( - match => [["isTradeItemAConsumerUnit", "true"],], + match => [ + ["isTradeItemAConsumerUnit", "true"], + ], fields => [ - + # source_field => target_field : assign the value of the source field to the target field ["gtin", "code"], # source_field => source_hash : go down one level - [ - "brandOwner", - { + ["brandOwner", { fields => [ ["gln", "sources_fields:org-gs1:gln"], # source_field => target_field1,target_field2 : assign value of the source field to multiple target fields @@ -541,22 +500,18 @@ my %gs1_product_to_off = ( ], }, ], - - [ - "gdsnTradeItemClassification", - { + + ["gdsnTradeItemClassification", { fields => [ ["gpcCategoryCode", "sources_fields:org-gs1:gpcCategoryCode"], # not always present and could be in different languages ["gpcCategoryName", "sources_fields:org-gs1:gpcCategoryName, +categories_if_match_in_taxonomy"], ], }, - ], - + ], + # will override brandOwner values if present - [ - "informationProviderOfTradeItem", - { + ["informationProviderOfTradeItem", { fields => [ ["gln", "sources_fields:org-gs1:gln"], # source_field => target_field1,target_field2 : assign value of the source field to multiple target fields @@ -564,167 +519,143 @@ my %gs1_product_to_off = ( ], }, ], - - [ - "targetMarket", - { - fields => [["targetMarketCountryCode", "countries%targetMarketCountryCode"],], + + ["targetMarket", { + fields => [ + ["targetMarketCountryCode", "countries%targetMarketCountryCode"], + ], }, ], - + # http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:ContactTypeCode&release=4 # source_field => array of hashes: go down one level, expect an array - [ - "tradeItemContactInformation", - [ + ["tradeItemContactInformation", [ { # match => hash of key value conditions: assign values to field only if the conditions match - match => [["contactTypeCode", "CXC"],], - fields => [["contactName", "customer_service_fr"], ["contactAddress", "+customer_service_fr"],], + match => [ + ["contactTypeCode", "CXC"], + ], + fields => [ + ["contactName", "customer_service_fr"], + ["contactAddress", "+customer_service_fr"], + ], }, ], ], - - [ - "tradeItemInformation", + + ["tradeItemInformation", { fields => [ # Sometimes contains strings like "Signal CLAY&CHARCOAL DENTIFRICE 75 ML", not a good fit for the producer_version_id # but other time contains strings that look like internal version ids / item ids (e.g. "44041392") - [ - "productionVariantDescription", - "sources_fields:org-gs1:productionVariantDescription, producer_version_id" - ], - - [ - "extension", - { + ["productionVariantDescription", "sources_fields:org-gs1:productionVariantDescription, producer_version_id"], + + ["extension", { fields => [ - [ - "alcohol_information:alcoholInformationModule", - { + ["alcohol_information:alcoholInformationModule", { fields => [ - [ - "alcoholInformation", - { - fields => [["percentageOfAlcoholByVolume", "alcohol_100g_value"],], + ["alcoholInformation", { + fields => [ + ["percentageOfAlcoholByVolume", "alcohol_100g_value"], + ], }, ], ], }, - ], - - [ - "allergen_information:allergenInformationModule", - { + ], + + ["allergen_information:allergenInformationModule", { fields => [ - [ - "allergenRelatedInformation", - { + ["allergenRelatedInformation", { fields => [ - [ - "allergen", - [ + ["allergen", [ { - match => [["levelOfContainmentCode", "CONTAINS"],], + match => [ + ["levelOfContainmentCode", "CONTAINS"], + ], fields => [ # source_field => +target_field' : add to field, separate with commas if field is not empty # source_field => target_field%map_id : map the target value using the specified map_id # (do not assign a value if there is no corresponding entry in the map) - [ - 'allergenTypeCode', - '+allergens%allergenTypeCode' - ], + ['allergenTypeCode', '+allergens%allergenTypeCode'], ], }, { - match => - [["levelOfContainmentCode", "MAY_CONTAIN"],], + match => [ + ["levelOfContainmentCode", "MAY_CONTAIN"], + ], fields => [ # source_field => +target_field' : add to field, separate with commas if field is not empty # source_field => target_field%map_id : map the target value using the specified map_id # (do not assign a value if there is no corresponding entry in the map) - [ - 'allergenTypeCode', - '+traces%allergenTypeCode' - ], + ['allergenTypeCode', '+traces%allergenTypeCode'], ], }, ], ], - [ - "isAllergenRelevantDataProvided", - "sources_fields:org-gs1:isAllergenRelevantDataProvided" - ], + ["isAllergenRelevantDataProvided", "sources_fields:org-gs1:isAllergenRelevantDataProvided"], ], }, ], ], }, ], - - [ - "nutritional_information:nutritionalInformationModule", - { + + ["nutritional_information:nutritionalInformationModule", { fields => [ - ["nutrientHeader"], # nutrients are handled specially with specific code + ["nutrientHeader"], # nutrients are handled specially with specific code ], }, ], - - [ - "consumer_instructions:consumerInstructionsModule", - { + + ["consumer_instructions:consumerInstructionsModule", { fields => [ - [ - "consumerInstructions", - { - fields => - [["consumerStorageInstructions", "conservation_conditions"],], + ["consumerInstructions", { + fields => [ + ["consumerStorageInstructions", "conservation_conditions"], + ], }, ], ], } ], - - [ - "food_and_beverage_ingredient:foodAndBeverageIngredientModule", - { - fields => [["ingredientStatement", "ingredients_text"],], + + ["food_and_beverage_ingredient:foodAndBeverageIngredientModule", { + fields => [ + ["ingredientStatement", "ingredients_text"], + ], }, ], - - [ - "nonfood_ingredient:nonfoodIngredientModule", - { - fields => [["nonfoodIngredientStatement", "ingredients_text"],], + + ["nonfood_ingredient:nonfoodIngredientModule", { + fields => [ + ["nonfoodIngredientStatement", "ingredients_text"], + ], }, - ], - - [ - "food_and_beverage_preparation_serving:foodAndBeveragePreparationServingModule", - { + ], + + ["food_and_beverage_preparation_serving:foodAndBeveragePreparationServingModule", { fields => [ - [ - "preparationServing", - { - fields => [["preparationInstructions", "preparation"],], + ["preparationServing", { + fields => [ + ["preparationInstructions", "preparation"], + ], }, ], ], }, ], - - [ - "health_related_information:healthRelatedInformationModule", - { + + ["health_related_information:healthRelatedInformationModule", { fields => [ - [ - "healthRelatedInformation", - { - match => [["nutritionalProgramCode", "8"],], - fields => [["nutritionalScore", "nutriscore_grade_producer"],], + ["healthRelatedInformation", { + match => [ + ["nutritionalProgramCode","8"], + ], + fields => [ + ["nutritionalScore", "nutriscore_grade_producer"], + ], }, ], ], @@ -732,18 +663,14 @@ my %gs1_product_to_off = ( ], # 2021-12-20: it looks like the nutritionalProgramCode is now in an extra nutritionProgram field - [ - "health_related_information:healthRelatedInformationModule", - { + ["health_related_information:healthRelatedInformationModule", { fields => [ - [ - "healthRelatedInformation", - { + ["healthRelatedInformation", { fields => [ - [ - "nutritionalProgram", - { - match => [["nutritionalProgramCode", "8"],], + ["nutritionalProgram", { + match => [ + ["nutritionalProgramCode","8"], + ], fields => [ ["nutritionalScore", "nutriscore_grade_producer"], ], @@ -755,34 +682,25 @@ my %gs1_product_to_off = ( ], }, ], - - [ - "packaging_information:packagingInformationModule", - { + + ["packaging_information:packagingInformationModule", { fields => [ - [ - "packaging", - { - fields => [["packagingTypeCode", "+packaging%packagingTypeCode"],], + ["packaging", { + fields => [ + ["packagingTypeCode", "+packaging%packagingTypeCode"], + ], }, ], ], }, - ], - - [ - "packaging_marking:packagingMarkingModule", - { + ], + + ["packaging_marking:packagingMarkingModule", { fields => [ - [ - "packagingMarking", - { + ["packagingMarking", { fields => [ # the source can be an array if there are multiple labels - [ - "packagingMarkedLabelAccreditationCode", - "+labels%packagingMarkedLabelAccreditationCode" - ], + ["packagingMarkedLabelAccreditationCode", "+labels%packagingMarkedLabelAccreditationCode"], ], }, ], @@ -790,13 +708,9 @@ my %gs1_product_to_off = ( }, ], - [ - "place_of_item_activity:placeOfItemActivityModule", - { + ["place_of_item_activity:placeOfItemActivityModule", { fields => [ - [ - "placeOfProductActivity", - { + ["placeOfProductActivity", { fields => [ # provenanceStatement is a free text field, which can contain manufacturing places # and/or origins of ingredients and related statements, in different languages @@ -806,44 +720,42 @@ my %gs1_product_to_off = ( ], ], }, - ], - - [ - "referenced_file_detail_information:referencedFileDetailInformationModule", - { + ], + + ["referenced_file_detail_information:referencedFileDetailInformationModule", { fields => [ - [ - "referencedFileHeader", - [ + ["referencedFileHeader", [ { - match => [["isPrimaryFile", "TRUE"],], - fields => [["uniformResourceIdentifier", "image_front_url"],], + match => [ + ["isPrimaryFile", "TRUE"], + ], + fields => [ + ["uniformResourceIdentifier", "image_front_url"], + ], }, { - does_not_match => [["isPrimaryFile", "TRUE"],], - fields => [["uniformResourceIdentifier", "+image_other_url"],], + does_not_match => [ + ["isPrimaryFile", "TRUE"], + ], + fields => [ + ["uniformResourceIdentifier", "+image_other_url"], + ], }, ], ], ], }, - ], - - [ - "trade_item_description:tradeItemDescriptionModule", - { + ], + + ["trade_item_description:tradeItemDescriptionModule", { fields => [ - [ - "tradeItemDescriptionInformation", - { + ["tradeItemDescriptionInformation", { fields => [ ["descriptionShort", "abbreviated_product_name"], ["functionalName", "+categories_if_match_in_taxonomy"], ["regulatedProductName", "generic_name"], ["tradeItemDescription", "product_name"], - [ - "brandNameInformation", - { + ["brandNameInformation", { fields => [ ['brandName' => '+brands'], ['subBrand' => '+brands'], @@ -856,20 +768,16 @@ my %gs1_product_to_off = ( ], }, ], - - [ - "trade_item_measurements:tradeItemMeasurementsModule", - { + + ["trade_item_measurements:tradeItemMeasurementsModule", { fields => [ - [ - "tradeItemMeasurements", - { + ["tradeItemMeasurements", { fields => [ ["netContent", "quantity"], - [ - "tradeItemWeight", - { - fields => [["netWeight", "net_weight"],], + ["tradeItemWeight", { + fields => [ + ["netWeight", "net_weight"], + ], }, ], ], @@ -877,15 +785,11 @@ my %gs1_product_to_off = ( ], ], }, - ], - - [ - "trade_item_lifespan:tradeItemLifespanModule", - { + ], + + ["trade_item_lifespan:tradeItemLifespanModule", { fields => [ - [ - "tradeItemLifespan", - { + ["tradeItemLifespan", { fields => [ # the source can be an array if there are multiple labels ["itemPeriodSafeToUseAfterOpening", "+periods_after_opening"], @@ -901,13 +805,10 @@ my %gs1_product_to_off = ( ], }, ], - - [ - "tradeItemSynchronisationDates", - { + + ["tradeItemSynchronisationDates", { fields => [ - ["publicationDateTime", "sources_fields:org-gs1:publicationDateTime"] - , # Not available in CodeOnline export + ["publicationDateTime", "sources_fields:org-gs1:publicationDateTime"], # Not available in CodeOnline export ["lastChangeDateTime", "sources_fields:org-gs1:lastChangeDateTime"], ], }, @@ -915,6 +816,7 @@ my %gs1_product_to_off = ( ], ); + =head1 FUNCTIONS =head2 init_csv_fields () @@ -929,10 +831,11 @@ my @csv_fields = (); sub init_csv_fields() { %seen_csv_fields = (); - @csv_fields = (); + @csv_fields = (); return; } + =head2 assign_field ( $results_ref $target_field $target_value) Used to assign a value to a field, and keep track of the order of the fields we are matching, @@ -940,10 +843,10 @@ so that we can output the fields in the same order when we export a CSV. =cut -sub assign_field ($results_ref, $target_field, $target_value) { - +sub assign_field($results_ref, $target_field, $target_value) { + $results_ref->{$target_field} = $target_value; - + if (not defined $seen_csv_fields{$target_field}) { push @csv_fields, $target_field; $seen_csv_fields{$target_field} = 1; @@ -951,7 +854,8 @@ sub assign_field ($results_ref, $target_field, $target_value) { return; } -sub extract_nutrient_quantity_contained ($type, $per, $results_ref, $nid, $nutrient_detail_ref) { + +sub extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, $nutrient_detail_ref ) { my $nutrient_field = $nid . $type . "_" . $per; @@ -963,9 +867,7 @@ sub extract_nutrient_quantity_contained ($type, $per, $results_ref, $nid, $nutri # - Equadis has 2 ENER- nutrientDetail, each with a single quantityContained hash # - Agena3000 has 1 ENER- nutrientDetail with an array of 2 quantityContained # --> convert a single hash to an array with a hash - if ( (defined $nutrient_detail_ref->{quantityContained}) - and (ref($nutrient_detail_ref->{quantityContained}) ne "ARRAY")) - { + if ((defined $nutrient_detail_ref->{quantityContained}) and (ref($nutrient_detail_ref->{quantityContained}) ne "ARRAY")) { $nutrient_detail_ref->{quantityContained} = [$nutrient_detail_ref->{quantityContained}]; } @@ -980,14 +882,13 @@ sub extract_nutrient_quantity_contained ($type, $per, $results_ref, $nid, $nutri $nutrient_unit = $gs1_maps{measurementUnitCode}{$quantity_contained_ref->{measurementUnitCode}}; } else { - $log->error("gs1_to_off - unrecognized quantity contained", {quantityContained => $quantity_contained_ref}) - if $log->is_error(); + $log->error("gs1_to_off - unrecognized quantity contained", + { quantityContained => $quantity_contained_ref }) if $log->is_error(); } # less than < modifier - if ( (defined $nutrient_detail_ref->{measurementPrecisionCode}) - and ($nutrient_detail_ref->{measurementPrecisionCode} eq "LESS_THAN")) - { + if ((defined $nutrient_detail_ref->{measurementPrecisionCode}) + and ($nutrient_detail_ref->{measurementPrecisionCode} eq "LESS_THAN")) { $nutrient_value = "< " . $nutrient_value; } @@ -1007,6 +908,7 @@ sub extract_nutrient_quantity_contained ($type, $per, $results_ref, $nid, $nutri return; } + =head2 gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) Recursive function to go through all first level keys of the $gs1_to_off_ref mapping. @@ -1025,113 +927,100 @@ The same hash reference is passed to recursive calls to the gs1_to_off function. =cut + sub gs1_to_off; sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # We should have a hash if (ref($json_ref) ne "HASH") { - $log->error("gs1_to_off - json_ref is not a hash", - {gs1_to_off_ref => $gs1_to_off_ref, json_ref => $json_ref, results_ref => $results_ref}) - if $log->is_error(); + $log->error("gs1_to_off - json_ref is not a hash", { gs1_to_off_ref => $gs1_to_off_ref, json_ref => $json_ref, results_ref => $results_ref }) if $log->is_error(); return; } - - $log->debug("gs1_to_off", {json_ref_keys => [sort keys %$json_ref]}) if $log->is_debug(); - + + $log->debug("gs1_to_off", { json_ref_keys => [sort keys %$json_ref] }) if $log->is_debug(); + # Check the matching conditions if any - + if (defined $gs1_to_off_ref->{match}) { - - $log->debug("gs1_to_off - checking conditions", {match => $gs1_to_off_ref->{match}}) if $log->is_debug(); - + + $log->debug("gs1_to_off - checking conditions", { match => $gs1_to_off_ref->{match} } ) if $log->is_debug(); + foreach my $match_field_ref (@{$gs1_to_off_ref->{match}}) { - + my $match_field = $match_field_ref->[0]; my $match_value = $match_field_ref->[1]; - - if ( (not defined $json_ref->{$match_field}) - or ($json_ref->{$match_field} ne $match_value)) - { - - $log->debug( - "gs1_to_off - condition does not match", - { - match_field => $match_field, + + if ((not defined $json_ref->{$match_field}) + or ($json_ref->{$match_field} ne $match_value)) { + + $log->debug("gs1_to_off - condition does not match", + { match_field => $match_field, match_value => $match_value, - actual_value => $json_ref->{$match_field} - } - ) if $log->is_debug(); - + actual_value => $json_ref->{$match_field} }) if $log->is_debug(); + return; - } + } } - + $log->debug("gs1_to_off - conditions match") if $log->is_debug(); } - + # Check the matching exceptions - + if (defined $gs1_to_off_ref->{does_not_match}) { - - $log->debug("gs1_to_off - checking conditions", {does_not_match => $gs1_to_off_ref->{does_not_match}}) - if $log->is_debug(); - + + $log->debug("gs1_to_off - checking conditions", { does_not_match => $gs1_to_off_ref->{does_not_match} } ) if $log->is_debug(); + my $match = 1; - + foreach my $match_field_ref (@{$gs1_to_off_ref->{does_not_match}}) { - + my $match_field = $match_field_ref->[0]; my $match_value = $match_field_ref->[1]; - - if ( (not defined $json_ref->{$match_field}) - or ($json_ref->{$match_field} ne $match_value)) - { - - $log->debug( - "gs1_to_off - condition does not match", - { - match_field => $match_field, + + if ((not defined $json_ref->{$match_field}) + or ($json_ref->{$match_field} ne $match_value)) { + + $log->debug("gs1_to_off - condition does not match", + { match_field => $match_field, match_value => $match_value, - actual_value => $json_ref->{$match_field} - } - ) if $log->is_debug(); - + actual_value => $json_ref->{$match_field} }) if $log->is_debug(); + $match = 0; last; - } + } } - + return if $match; - + $log->debug("gs1_to_off - conditions match") if $log->is_debug(); - } - + } + $log->debug("gs1_to_off - assigning fields") if $log->is_debug(); - + # If the conditions match, assign the fields foreach my $source_field_ref (@{$gs1_to_off_ref->{fields}}) { - + my $source_field = $source_field_ref->[0]; my $source_target = $source_field_ref->[1]; - - $log->debug("gs1_to_off - source fields", {source_field => $source_field}) if $log->is_debug(); - + + $log->debug("gs1_to_off - source fields", { source_field => $source_field }) if $log->is_debug(); + if (defined $json_ref->{$source_field}) { - + $log->debug("gs1_to_off - existing source fields", - {source_field => $source_field, ref => ref($source_target)}) - if $log->is_debug(); - + { source_field => $source_field, ref => ref($source_target) }) if $log->is_debug(); + # if the source field is nutrientHeader, we need to extract multiple # nutrition facts tables (for unprepared and prepared product) # with multiple nutrients. # As the mapping is complex, it is done with special code below # instead of the generic matching code. - + if ($source_field eq "nutrientHeader") { - + $log->debug("gs1_to_off - special handling for nutrientHeader array") if $log->is_debug(); # If there is only one nutrition facts table, nutrientHeader might not be an array @@ -1140,39 +1029,36 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { if (ref($json_ref->{$source_field}) eq 'HASH') { $json_ref->{$source_field} = [$json_ref->{$source_field}]; } - + # Some products like ice cream may have nutrients per 100g + nutrients per 100ml # in that case, the last values (e.g. for 100g) will override previous values (e.g. for 100ml) - + foreach my $nutrient_header_ref (@{$json_ref->{$source_field}}) { - + my $type = ""; - + if ($nutrient_header_ref->{preparationStateCode} eq "PREPARED") { $type = "_prepared"; } - + my $serving_size_value; my $serving_size_unit; my $serving_size_description; my $serving_size_description_lc; - + if (defined $nutrient_header_ref->{servingSize}{'#'}) { $serving_size_value = $nutrient_header_ref->{servingSize}{'#'}; - $serving_size_unit = $gs1_maps{measurementUnitCode} - {$nutrient_header_ref->{servingSize}{'@'}{measurementUnitCode}}; + $serving_size_unit = $gs1_maps{measurementUnitCode}{$nutrient_header_ref->{servingSize}{'@'}{measurementUnitCode}}; } elsif (defined $nutrient_header_ref->{servingSize}{'$t'}) { $serving_size_value = $nutrient_header_ref->{servingSize}{'$t'}; - $serving_size_unit - = $gs1_maps{measurementUnitCode}{$nutrient_header_ref->{servingSize}{measurementUnitCode}}; + $serving_size_unit = $gs1_maps{measurementUnitCode}{$nutrient_header_ref->{servingSize}{measurementUnitCode}}; } else { $log->error("gs1_to_off - unrecognized serving size", - {servingSize => $nutrient_header_ref->{servingSize}}) - if $log->is_error(); + { servingSize => $nutrient_header_ref->{servingSize} }) if $log->is_error(); } - + # We may have a servingSizeDescription in multiple languages, in that case, take the first one if (defined $nutrient_header_ref->{servingSizeDescription}) { @@ -1197,20 +1083,19 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { } } } - + my $per = "100g"; - + if ((defined $serving_size_value) and ($serving_size_value != 100)) { $per = "serving"; - $serving_size_value += 0; # remove extra .0 - + $serving_size_value += 0; # remove extra .0 + # Some serving sizes have an extra description # e.g. par portion : 14 g + 200 ml d'eau my $extra_serving_size_description = ""; if ((defined $serving_size_description) and (defined $serving_size_description_lc)) { # Par Portion de 30 g (2) - $serving_size_description - =~ s/^(par |pour )?((1 |une )?(part |portion ))?(de )?\s*:?=?\s*//i; + $serving_size_description =~ s/^(par |pour )?((1 |une )?(part |portion ))?(de )?\s*:?=?\s*//i; $serving_size_description =~ s/( |\d)(gr|grammes)$/$1g/i; # Par Portion de 30 g (2) : remove number of portions $serving_size_description =~ s/\(\d+\)//i; @@ -1219,110 +1104,93 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # skip the extra description if it is equal to value + unit # to avoid things like 43 g (43 g) # "Pour 45g ?²?" --> ignore bogus characters at the end - if ( - ($serving_size_description !~ /^\s*$/) - and ($serving_size_description - !~ /^$serving_size_value\s*$serving_size_unit(\?|\.|\,|\s|\*|²)*$/i) - ) - { + if (($serving_size_description !~ /^\s*$/) + and ($serving_size_description !~ /^$serving_size_value\s*$serving_size_unit(\?|\.|\,|\s|\*|²)*$/i)) { $extra_serving_size_description = ' (' . $serving_size_description . ')'; } } - - assign_field($results_ref, "serving_size", - $serving_size_value . " " . $serving_size_unit . $extra_serving_size_description); + + assign_field($results_ref, "serving_size", $serving_size_value . " " . $serving_size_unit . $extra_serving_size_description); } - + if (defined $nutrient_header_ref->{nutrientDetail}) { - + # If there's only one nutrient, we may not get an array - + if (ref($nutrient_header_ref->{nutrientDetail}) ne 'ARRAY') { - $log->error("gs1_to_off - nutrient_header is not an array ", {results_ref => $results_ref}) - if $log->is_error(); + $log->error("gs1_to_off - nutrient_header is not an array ", { results_ref => $results_ref }) if $log->is_error(); next; } - + foreach my $nutrient_detail_ref (@{$nutrient_header_ref->{nutrientDetail}}) { my $nid = $gs1_maps{nutrientTypeCode}{$nutrient_detail_ref->{nutrientTypeCode}}; - + if (defined $nid) { - extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, - $nutrient_detail_ref); + extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, $nutrient_detail_ref) } else { $log->error("gs1_to_off - unrecognized nutrient", - {code => $results_ref->{code}, nutrient_detail_ref => $nutrient_detail_ref}) - if $log->is_error(); + { code => $results_ref->{code}, nutrient_detail_ref => $nutrient_detail_ref }) if $log->is_error(); my $map = "nutrientTypeCode"; my $source_value = $nutrient_detail_ref->{nutrientTypeCode}; defined $unknown_entries_in_gs1_maps{$map} or $unknown_entries_in_gs1_maps{$map} = {}; - defined $unknown_entries_in_gs1_maps{$map}{$source_value} - or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; + defined $unknown_entries_in_gs1_maps{$map}{$source_value} or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; $unknown_entries_in_gs1_maps{$map}{$source_value}++; } } } } } - - # If the value is a scalar, it is a target field (or multiple target fields) + + # If the value is a scalar, it is a target field (or multiple target fields) elsif (ref($source_target) eq "") { - - $log->debug( - "gs1_to_off - source field directly maps to target field", - {source_field => $source_field, target_field => $source_target} - ) if $log->is_debug(); - + + $log->debug("gs1_to_off - source field directly maps to target field", + { source_field => $source_field, target_field => $source_target }) if $log->is_debug(); + # We may have multiple source values, in an array - + my @source_values; - + if (ref($json_ref->{$source_field}) eq "ARRAY") { @source_values = @{$json_ref->{$source_field}}; } else { @source_values = ($json_ref->{$source_field}); } - + foreach my $source_value (@source_values) { - + # We may have multiple target fields, separated by commas foreach my $target_field (split(/\s*,\s*/, $source_target)) { - - $log->debug( - "gs1_to_off - assign value to target field", - { - source_field => $source_field, - source_value => $source_value, - target_field => $target_field - } - ) if $log->is_debug(); - + + $log->debug("gs1_to_off - assign value to target field", + { source_field => $source_field, source_value => $source_value, target_field => $target_field }) if $log->is_debug(); + # We might combine a value and a unit, but also keep them separate so that we can assign fields like quantity_value and quantity_unit my $source_value_value; my $source_value_unit; - + # Some fields indicate a language: - - # ingredientStatement: { - # languageCode: "fr", - # $t: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium." - # }, + + # ingredientStatement: { + # languageCode: "fr", + # $t: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium." + # }, # or another format (depending on how the XML was converted to JSON): - - # ingredientStatement: { - # #: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium.", - # @: { - # languageCode: "fr" - # } - # }, + + # ingredientStatement: { + # #: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium.", + # @: { + # languageCode: "fr" + # } + # }, if (ref($source_value) eq "HASH") { my $language_code; my $value; - + # There may be a language code if (defined $source_value->{languageCode}) { $language_code = $source_value->{languageCode}; @@ -1330,27 +1198,26 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { elsif ((defined $source_value->{'@'}) and (defined $source_value->{'@'}{languageCode})) { $language_code = $source_value->{'@'}{languageCode}; } - + # Keep track of language codes so that we can assign the lc and lang fields if (defined $language_code) { defined $results_ref->{languages} or $results_ref->{languages} = {}; - defined $results_ref->{languages}{$language_code} - or $results_ref->{languages}{$language_code} = 0; + defined $results_ref->{languages}{$language_code} or $results_ref->{languages}{$language_code} = 0; $results_ref->{languages}{$language_code}++; } - + if (defined $source_value->{'$t'}) { $value = $source_value->{'$t'}; } elsif (defined $source_value->{'#'}) { $value = $source_value->{'#'}; } - + # There may be a measurement unit code, or a time measurement unit code # in that case, concatenate it to the value - + foreach my $code ("measurementUnitCode", "timeMeasurementUnitCode") { - + if (defined $source_value->{$code}) { $source_value_value = $value; $source_value_unit = $gs1_maps{$code}{$source_value->{$code}}; @@ -1362,45 +1229,30 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { $value .= " " . $gs1_maps{$code}{$source_value->{'@'}{$code}}; } } - + # If the field is a language specific field, we can assign the value to the language specific field if ((defined $language_code) and (defined $language_fields{$target_field})) { $target_field = $target_field . "_" . lc($language_code); - $log->debug( - "gs1_to_off - changed to language specific target field", - { - source_field => $source_field, - source_value => $source_value, - target_field => $target_field - } - ) if $log->is_debug(); + $log->debug("gs1_to_off - changed to language specific target field", + { source_field => $source_field, source_value => $source_value, target_field => $target_field }) if $log->is_debug(); } - + if (defined $value) { $source_value = $value; } else { - $log->error( - "gs1_to_off - issue with source value structure", - { - source_field => $source_field, - source_value => $source_value, - target_field => $target_field - } - ) if $log->is_error(); + $log->error("gs1_to_off - issue with source value structure", + { source_field => $source_field, source_value => $source_value, target_field => $target_field }) if $log->is_error(); $source_value = undef; } } - - if ( - (defined $source_value) - and ($source_value ne "") + + if ((defined $source_value) and ($source_value ne "") # CodeOnline sometimes has empty values '.' or '0' for partyName for one of the fields brandOwner or informationProviderOfTradeItem # ignore them in order to keep the partyName value from the other fields - and not(($source_field eq "partyName") and (length($source_value) < 2)) - ) - { - + and not (($source_field eq "partyName") and (length($source_value) < 2)) + ) { + # allergenTypeCode => '+traces%allergens', # % sign means we will use a map to transform the source value if ($target_field =~ /\%/) { @@ -1410,38 +1262,28 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { $source_value = $gs1_maps{$map}{$source_value}; } else { - $log->error( - "gs1_to_off - unknown source value for map", - { - code => $results_ref->{code}, - source_field => $source_field, - source_value => $source_value, - target_field => $target_field, - map => $map - } - ) if $log->is_error(); - defined $unknown_entries_in_gs1_maps{$map} - or $unknown_entries_in_gs1_maps{$map} = {}; - defined $unknown_entries_in_gs1_maps{$map}{$source_value} - or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; + $log->error("gs1_to_off - unknown source value for map", + { code => $results_ref->{code}, source_field => $source_field, source_value => $source_value, target_field => $target_field, map => $map }) if $log->is_error(); + defined $unknown_entries_in_gs1_maps{$map} or $unknown_entries_in_gs1_maps{$map} = {}; + defined $unknown_entries_in_gs1_maps{$map}{$source_value} or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; $unknown_entries_in_gs1_maps{$map}{$source_value}++; # Skip the entry next; } - } - + } + # allergenTypeCode => '+traces%allergens', # + sign means we will create a comma separated list if we have multiple values if ($target_field =~ /^\+/) { $target_field = $'; - + if (defined $results_ref->{$target_field}) { $source_value = $results_ref->{$target_field} . ', ' . $source_value; } } - + assign_field($results_ref, $target_field, $source_value); - + if ($target_field eq "quantity") { if (defined $source_value_value) { assign_field($results_ref, $target_field . "_value", $source_value_value); @@ -1450,53 +1292,53 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { } } } - + } } - + elsif (ref($source_target) eq "ARRAY") { - - # http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:ContactTypeCode&release=4 - # source_field => array of hashes: go down one level, expect an array - # - # ["tradeItemContactInformation", [ - # { - # # match => hash of key value conditions: assign values to field only if the conditions match - # match => [ - # ["contactTypeCode", "CXC"], - # ], - # fields => [ - # ["contactAddress", "customer_service_fr"], - # ], - # }, - # ], - # ], - - $log->debug("gs1_to_off - array field", {source_field => $source_field}) if $log->is_debug(); - + +# http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:ContactTypeCode&release=4 +# source_field => array of hashes: go down one level, expect an array +# +# ["tradeItemContactInformation", [ +# { +# # match => hash of key value conditions: assign values to field only if the conditions match +# match => [ +# ["contactTypeCode", "CXC"], +# ], +# fields => [ +# ["contactAddress", "customer_service_fr"], +# ], +# }, +# ], +# ], + + $log->debug("gs1_to_off - array field", { source_field => $source_field }) if $log->is_debug(); + # Loop through the array entries of the GS1 to OFF mapping - + foreach my $gs1_to_off_array_entry_ref (@{$source_target}) { - + # Loop through the array entries of the JSON file - + # If the source file is not an array, create it # (e.g. if only one element is there, the xml to json conversion might not create an array) if (ref($json_ref->{$source_field}) ne "ARRAY") { - $json_ref->{$source_field} = [$json_ref->{$source_field}]; + $json_ref->{$source_field} = [ $json_ref->{$source_field} ]; } - + foreach my $json_array_entry_ref (@{$json_ref->{$source_field}}) { gs1_to_off($gs1_to_off_array_entry_ref, $json_array_entry_ref, $results_ref); } } - } - + } + elsif (ref($source_target) eq "HASH") { - + # Go down one level - + # The source structure may be a hash or an array of hashes # e.g. Equadis: allergenRelatedInformation is a hash, CodeOnline: it is an array @@ -1512,14 +1354,13 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # { # allergenTypeCode: "AE", # levelOfContainmentCode: "CONTAINS" - # }, + # }, $log->debug("gs1_to_off - source_target is a hash", - {source_field => $source_field, source_target => $source_target, json_ref => $json_ref}) - if $log->is_debug(); - + { source_field => $source_field, source_target => $source_target, json_ref => $json_ref }) if $log->is_debug(); + if (ref($json_ref->{$source_field}) eq "HASH") { - + gs1_to_off($source_target, $json_ref->{$source_field}, $results_ref); } elsif (ref($json_ref->{$source_field}) eq "ARRAY") { @@ -1530,20 +1371,13 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # allergenRelatedInformation: [ # [ ] # ] - + if (ref($json_array_entry_ref) eq "HASH") { gs1_to_off($source_target, $json_array_entry_ref, $results_ref); } else { - $log->debug( - "gs1_to_off - expected a hash but got an array", - { - source_field => $source_field, - source_target => $source_target, - json_ref => $json_ref, - json_array_entry_ref => $json_array_entry_ref - } - ) if $log->is_debug(); + $log->debug("gs1_to_off - expected a hash but got an array", + { source_field => $source_field, source_target => $source_target, json_ref => $json_ref, json_array_entry_ref => $json_array_entry_ref }) if $log->is_debug(); } } } @@ -1553,6 +1387,7 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { return; } + =head2 convert_single_text_property_to_direct_value ($json ) There are different ways to convert a XML document to a JSON data structure. @@ -1586,18 +1421,16 @@ gtin: "03449865355608" =cut -sub convert_single_text_property_to_direct_value ($json_ref) { - my $type = ref $json_ref or return; +sub convert_single_text_property_to_direct_value($json_ref) { + + my $type = ref $json_ref or return; - if ($type eq 'HASH') { + if ($type eq 'HASH') { foreach my $key (keys %$json_ref) { if (ref $json_ref->{$key}) { # Hash with a single $t value? - if ( (ref $json_ref->{$key} eq 'HASH') - and ((scalar keys %{$json_ref->{$key}}) == 1) - and (defined $json_ref->{$key}{'$t'})) - { + if ((ref $json_ref->{$key} eq 'HASH') and ((scalar keys %{$json_ref->{$key}}) == 1) and (defined $json_ref->{$key}{'$t'})) { $json_ref->{$key} = $json_ref->{$key}{'$t'}; } else { @@ -1605,18 +1438,19 @@ sub convert_single_text_property_to_direct_value ($json_ref) { } } } - } - elsif ($type eq 'ARRAY') { - - foreach my $elem (@$json_ref) { - if (ref $elem) { - convert_single_text_property_to_direct_value($elem); - } - } - } - return; + } + elsif ($type eq 'ARRAY') { + + foreach my $elem (@$json_ref) { + if (ref $elem) { + convert_single_text_property_to_direct_value($elem); + } + } + } + return; } + =head2 convert_gs1_json_message_to_off_products_csv_fields ($json, $products_ref, $messages_ref) Thus function converts the data for one or more products in the GS1 format converted to JSON. @@ -1644,8 +1478,9 @@ Each message will be added as one element (a hash ref) of the messages data arra =cut -sub convert_gs1_json_message_to_off_products_csv ($json_ref, $products_ref, $messages_ref) { +sub convert_gs1_json_message_to_off_products_csv($json_ref, $products_ref, $messages_ref) { + # Depending on how the original XML was converted to JSON, # text values of XML tags can be assigned directly as the value of the corresponding key # or they can be stored inside a hash with the $t key @@ -1653,10 +1488,10 @@ sub convert_gs1_json_message_to_off_products_csv ($json_ref, $products_ref, $mes # levelOfContainmentCode: { # $t: "MAY_CONTAIN" # }, - + # The JSON can contain only the product information "tradeItem" level # or the tradeItem can be encapsulated in a message - + # catalogue_item_notification:catalogueItemNotificationMessage # - transaction # -- documentCommand @@ -1670,69 +1505,62 @@ sub convert_gs1_json_message_to_off_products_csv ($json_ref, $products_ref, $mes my $message_ref = {}; gs1_to_off(\%gs1_message_to_off, $json_ref, $message_ref); push @$messages_ref, $message_ref; - $log->debug("convert_gs1_json_to_off_csv - GS1 message fields", {message_ref => $message_ref}) - if $log->is_debug(); + $log->debug("convert_gs1_json_to_off_csv - GS1 message fields", { message_ref => $message_ref }) if $log->is_debug(); } - - foreach my $field ( - qw( + + foreach my $field (qw( catalogue_item_notification:catalogueItemNotificationMessage transaction documentCommand catalogue_item_notification:catalogueItemNotification catalogueItem - ) - ) - { + )) { if (defined $json_ref->{$field}) { $json_ref = $json_ref->{$field}; - $log->debug("convert_gs1_json_to_off_csv - remove encapsulating field", {field => $field}) - if $log->is_debug(); + $log->debug("convert_gs1_json_to_off_csv - remove encapsulating field", { field => $field }) if $log->is_debug(); } } # A product can contain a child product my $child_product_json_ref = deep_get($json_ref, qw(catalogueItemChildItemLink catalogueItem)); if (defined $child_product_json_ref) { - $log->debug("convert_gs1_json_to_off_csv - found a child item", {}) if $log->is_debug(); - convert_gs1_json_message_to_off_products_csv($child_product_json_ref, $products_ref, $messages_ref); + $log->debug("convert_gs1_json_to_off_csv - found a child item", { }) if $log->is_debug(); + convert_gs1_json_message_to_off_products_csv($child_product_json_ref, $products_ref, $messages_ref) } if (defined $json_ref->{tradeItem}) { $json_ref = $json_ref->{tradeItem}; } - + if (not defined $json_ref->{gtin}) { - $log->debug("convert_gs1_json_to_off_csv - no gtin - skipping", {json_ref => $json_ref}) if $log->is_debug(); + $log->debug("convert_gs1_json_to_off_csv - no gtin - skipping", { json_ref => $json_ref }) if $log->is_debug(); return {}; } - + if ((not defined $json_ref->{isTradeItemAConsumerUnit}) or ($json_ref->{isTradeItemAConsumerUnit} ne "true")) { - $log->debug( - "convert_gs1_json_to_off_csv - isTradeItemAConsumerUnit not true - skipping", - {isTradeItemAConsumerUnit => $json_ref->{isTradeItemAConsumerUnit}} - ) if $log->is_debug(); + $log->debug("convert_gs1_json_to_off_csv - isTradeItemAConsumerUnit not true - skipping", + { isTradeItemAConsumerUnit => $json_ref->{isTradeItemAConsumerUnit} }) if $log->is_debug(); return {}; } - + my $product_ref = {}; - + gs1_to_off(\%gs1_product_to_off, $json_ref, $product_ref); - + # assign the lang and lc fields if (defined $product_ref->{languages}) { - my @sorted_languages = sort({$product_ref->{languages}{$b} <=> $product_ref->{languages}{$a}} - keys %{$product_ref->{languages}}); + my @sorted_languages = sort ( { $product_ref->{languages}{$b} <=> $product_ref->{languages}{$a} } keys %{$product_ref->{languages}}); my $top_language = $sorted_languages[0]; $product_ref->{lc} = $top_language; $product_ref->{lang} = $top_language; delete $product_ref->{languages}; } - + push @$products_ref, $product_ref; return; } + =head2 read_gs1_json_file ($json_file, $products_ref, $messages_ref) Read a GS1 message file in json format, convert the included products in the OFF format, @@ -1751,12 +1579,12 @@ The encapsulating GS1 message is added to the $messages_ref array =cut -sub read_gs1_json_file ($json_file, $products_ref, $messages_ref) { - - $log->debug("read_gs1_json_file", {json_file => $json_file}) if $log->is_debug(); +sub read_gs1_json_file($json_file, $products_ref, $messages_ref) { - open(my $in, "<", $json_file) or die("Cannot open json file $json_file : $!\n"); - my $json = join(q{}, (<$in>)); + $log->debug("read_gs1_json_file", { json_file => $json_file }) if $log->is_debug(); + + open (my $in, "<", $json_file) or die("Cannot open json file $json_file : $!\n"); + my $json = join (q{}, (<$in>)); close($in); my $json_ref = decode_json($json); @@ -1765,20 +1593,22 @@ sub read_gs1_json_file ($json_file, $products_ref, $messages_ref) { # to the format generated by the nodejs xml2json module # which is the expected format of the ProductOpener::GS1 module convert_single_text_property_to_direct_value($json_ref); - + convert_gs1_json_message_to_off_products_csv($json_ref, $products_ref, $messages_ref); return; } + sub generate_gs1_message_identifier() { # local GLN + 60 random hexadecimal characters my $identifier = deep_get(\%options, qw(gs1 local_gln)) . "_"; - $identifier .= sprintf("%x", rand 16) for 1 .. 60; + $identifier .= sprintf("%x", rand 16) for 1..60; return $identifier; } + =head2 generate_gs1_confirmation_message ($notification_message_ref, $timestamp) GS1 data pools (catalogs) send us GSDN Catalogue Item Notification (CIN) which are messages @@ -1803,7 +1633,7 @@ generate test confirmation messages which don't have a different content every t =cut -sub generate_gs1_confirmation_message ($notification_message_ref, $timestamp) { +sub generate_gs1_confirmation_message($notification_message_ref, $timestamp) { # We will need to generate a message identifier, put it in the XML content, # and return it as it is used as the file name @@ -1826,19 +1656,19 @@ sub generate_gs1_confirmation_message ($notification_message_ref, $timestamp) { # Include the notification data in the template data for the confirmation $confirmation_data_ref->{notification} = $notification_message_ref; + my $xml; if (process_template('gs1/catalogue_item_confirmation.tt.xml', $confirmation_data_ref, \$xml)) { - $log->debug("generate_gs1_confirmation_message - success", - {confirmation_instance_identifier => $confirmation_instance_identifier}) - if $log->is_error(); + $log->debug("generate_gs1_confirmation_message - success", { confirmation_instance_identifier => $confirmation_instance_identifier}) if $log->is_error(); } else { - $log->error("generate_gs1_confirmation_message - template error", {error => $tt->error()}) if $log->is_error(); + $log->error("generate_gs1_confirmation_message - template error", { error => $tt->error() }) if $log->is_error(); } return ($confirmation_instance_identifier, $xml); } + =head2 write_off_csv_file ($csv_file, $products_ref) Write all product data from the $products_ref array to a CSV file in OFF format. @@ -1851,38 +1681,34 @@ Write all product data from the $products_ref array to a CSV file in OFF format. =cut -sub write_off_csv_file ($csv_file, $products_ref) { - - $log->debug("write_off_csv_file", {csv_file => $csv_file}) if $log->is_debug(); +sub write_off_csv_file($csv_file, $products_ref) { + $log->debug("write_off_csv_file", { csv_file => $csv_file }) if $log->is_debug(); + open(my $filehandle, ">:encoding(UTF-8)", $csv_file) or die("Cannot write csv file $csv_file : $!\n"); - + my $separator = "\t"; - - my $csv = Text::CSV->new({binary => 1, sep_char => $separator}) # should set binary attribute. - or die "Cannot use CSV: " . Text::CSV->error_diag(); + + my $csv = Text::CSV->new ( { binary => 1 , sep_char => $separator } ) # should set binary attribute. + or die "Cannot use CSV: ".Text::CSV->error_diag (); # Print the header line with fields names - - $log->debug("write_off_csv_file - header", {csv_fields => \@csv_fields}) if $log->is_debug(); - - $csv->print($filehandle, \@csv_fields); + + $log->debug("write_off_csv_file - header", { csv_fields => \@csv_fields }) if $log->is_debug(); + + $csv->print ($filehandle, \@csv_fields); print $filehandle "\n"; - + # We may have the same product multiple times, sort by sources_fields:org-gs1:publicationDateTime # or by lastChangeDateTime (publicationDateTime is not in the CodeOnline export) my %seen_products = (); - + foreach my $product_ref ( - sort { - ($b->{"sources_fields:org-gs1:publicationDateTime"} // $b->{"sources_fields:org-gs1:lastChangeDateTime"}) - cmp($a->{"sources_fields:org-gs1:publicationDateTime"} - // $a->{"sources_fields:org-gs1:lastChangeDateTime"}) - } @$products_ref - ) - { - - $log->debug("write_off_csv_file - product", {code => $product_ref->{code}}) if $log->is_debug(); + sort {($b->{"sources_fields:org-gs1:publicationDateTime"} // $b->{"sources_fields:org-gs1:lastChangeDateTime"}) + cmp ($a->{"sources_fields:org-gs1:publicationDateTime"} // $a->{"sources_fields:org-gs1:lastChangeDateTime"}) } + @$products_ref) { + + $log->debug("write_off_csv_file - product", { code => $product_ref->{code} }) if $log->is_debug(); if (defined $seen_products{$product_ref->{code}}) { # Skip product for which we have a more recent publication next; @@ -1890,20 +1716,21 @@ sub write_off_csv_file ($csv_file, $products_ref) { else { $seen_products{$product_ref->{code}} = 1; } - + my @csv_fields_values = (); foreach my $field (@csv_fields) { push @csv_fields_values, $product_ref->{$field}; } - - $csv->print($filehandle, \@csv_fields_values); + + $csv->print ($filehandle, \@csv_fields_values); print $filehandle "\n"; } - + close $filehandle; return; } + =head2 print_unknown_entries_in_gs1_maps () Prints the entries for GS1 data types for which we do not have a corresponding OFF match, @@ -1912,24 +1739,22 @@ ordered by the number of occurrences in the GS1 data =cut sub print_unknown_entries_in_gs1_maps() { - + my $unknown_entries = 0; - + foreach my $map (sort keys %unknown_entries_in_gs1_maps) { print "$map map has unknown entries:\n"; - - foreach my $source_value ( - sort {$unknown_entries_in_gs1_maps{$map}{$a} <=> $unknown_entries_in_gs1_maps{$map}{$b}} - keys %{$unknown_entries_in_gs1_maps{$map}} - ) - { + + foreach my $source_value + (sort { $unknown_entries_in_gs1_maps{$map}{$a} <=> $unknown_entries_in_gs1_maps{$map}{$b} } + keys %{$unknown_entries_in_gs1_maps{$map}}) { print $source_value . "\t" . $unknown_entries_in_gs1_maps{$map}{$source_value} . "\n"; $unknown_entries++; } - + print "\n"; } - + return $unknown_entries; } From d497ec043f8cf908c503d915a7a03625f49e00c2 Mon Sep 17 00:00:00 2001 From: off Date: Mon, 12 Dec 2022 15:47:23 +0100 Subject: [PATCH 3/8] scripts for Bayard --- scripts/bayard-import/README.md | 32 ++++++++++++++++++++++ scripts/bayard-import/bayard-xml2json.js | 32 ++++++++++++++++++++++ scripts/bayard-import/find_and_copy.sh | 1 + scripts/bayard-import/run_bayard_import.sh | 26 ++++++++++++++++++ 4 files changed, 91 insertions(+) create mode 100644 scripts/bayard-import/README.md create mode 100644 scripts/bayard-import/bayard-xml2json.js create mode 100644 scripts/bayard-import/find_and_copy.sh create mode 100755 scripts/bayard-import/run_bayard_import.sh diff --git a/scripts/bayard-import/README.md b/scripts/bayard-import/README.md new file mode 100644 index 0000000000000..0b46333f53dc0 --- /dev/null +++ b/scripts/bayard-import/README.md @@ -0,0 +1,32 @@ + +# Requirements +- node (https://nodejs.org/) +- miller >=5.0 (https://johnkerl.org/miller/doc/) + +On stretch: +apt-get -t stretch-backports install miller + +# Usage +Install the xml2csv node module found here: https://github.com/odtvince/xml2csv + +As the off user: + +``` +mkdir ~/npm-global +export NPM_CONFIG_PREFIX=~/.npm-global +/srv/off-pro/scripts/equadis-import# git clone https://github.com/odtvince/xml2csv.git +/srv/off-pro/scripts/equadis-import# cd xml2csv/ +/srv/off-pro/scripts/equadis-import/xml2csv# npm link +/srv/off-pro/scripts/equadis-import/xml2csv# cd .. +/srv/off-pro/scripts/equadis-import# npm link xml2csv +``` + +Put all the xml files to import into the `equadis-data` directory + +Then execute: +``` +node equadis-xml2csv.js +./equadis2off.sh > equadis-data.tsv +./dereference.sh equadis-data.tsv +``` + diff --git a/scripts/bayard-import/bayard-xml2json.js b/scripts/bayard-import/bayard-xml2json.js new file mode 100644 index 0000000000000..b381ae4275804 --- /dev/null +++ b/scripts/bayard-import/bayard-xml2json.js @@ -0,0 +1,32 @@ +// This script is used to convert GDSN data from Equadis in XML format +// to a corresponding JSON structure + +const xml2json = require('xml2json') +const fs = require("fs") + +const directoryPath = "/srv2/off-pro/bayard-data-tmp/" + +const filter = /\.xml$/ + +// force arrays for some fields even if there is only one value supplied +const options = { + arrayNotation: ['nutrientHeader', 'allergen', 'packagingMarkedLabelAccreditationCode'] +}; + +fs.readdir(directoryPath, function(err, files) { + if (err) { + console.log("Error getting directory information.") + } else { + files.forEach(function(file) { + + if (filter.test(file)) { + + let content = fs.readFileSync(directoryPath+file, 'utf8'); + let json = xml2json.toJson(content, options); + fs.writeFileSync(directoryPath+file.replace('.xml','.json'), json); + } + + }) + } +}) + diff --git a/scripts/bayard-import/find_and_copy.sh b/scripts/bayard-import/find_and_copy.sh new file mode 100644 index 0000000000000..975639f195bbd --- /dev/null +++ b/scripts/bayard-import/find_and_copy.sh @@ -0,0 +1 @@ +find /home/sftp/equadis/data/ -mtime -5 -type f -exec grep -q 'NATURENVIE' {} \; -exec cp {} /srv2/off-pro/equadis-data-tmp/ \; diff --git a/scripts/bayard-import/run_bayard_import.sh b/scripts/bayard-import/run_bayard_import.sh new file mode 100755 index 0000000000000..28a0e4c8b7bec --- /dev/null +++ b/scripts/bayard-import/run_bayard_import.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +cd /srv/off-pro/scripts + +# copy files modified in the last few days + +rm -rf /srv2/off-pro/bayard-data-tmp +mkdir /srv2/off-pro/bayard-data-tmp +find /home/sftp/bayard/data/ -mtime -2 -type f -exec cp {} /srv2/off-pro/bayard-data-tmp/ \; + +# turn Bayard xml files into JSON file + +export NPM_CONFIG_PREFIX=~/.npm-global + +node /srv/off-pro/scripts/convert_gs1_xml_to_json_in_dir.js /srv2/off-pro/bayard-data-tmp/ + +# convert JSON files to a single CSV file + +export PERL5LIB=. + +/srv/off-pro/scripts/convert_gs1_json_to_off_csv.pl --input-dir /srv2/off-pro/bayard-data-tmp --output /srv2/off-pro/bayard-data-tmp/bayard-data.tsv + +# import CSV file + +export PERL5LIB="/srv/off-pro/lib:${PERL5LIB}" +/srv/off-pro/scripts/import_csv_file.pl --user_id bayard --org_id bayard --source_id bayard --source_name Bayard --source_url https://bayard.com/ --manufacturer 1 --comment "Import from Bayard" --define lc=de --images_download_dir /srv2/off-pro/bayard-images-tmp --csv_file /srv2/off-pro/bayard-data-tmp/bayard-data.tsv From b9808a48b1fb792906d3d10c5c5fe5335e59a724 Mon Sep 17 00:00:00 2001 From: off Date: Fri, 3 Feb 2023 17:21:37 +0100 Subject: [PATCH 4/8] changes for bayard import --- lib/ProductOpener/GS1.pm | 29 +- lib/ProductOpener/ImportConvert.pm | 851 +++++++++++++---------------- 2 files changed, 392 insertions(+), 488 deletions(-) diff --git a/lib/ProductOpener/GS1.pm b/lib/ProductOpener/GS1.pm index 4c405ea8fd31e..d562ab54ef55e 100644 --- a/lib/ProductOpener/GS1.pm +++ b/lib/ProductOpener/GS1.pm @@ -136,6 +136,7 @@ my %unknown_entries_in_gs1_maps = (); # Shellfish could be Molluscs or Crustaceans # "UN" => "Shellfish", "UW" => "Wheat", + "X99" => "None", }, measurementUnitCode => { @@ -234,29 +235,8 @@ my %unknown_entries_in_gs1_maps = (); # skipped X_ entries such as X_ACAI_BERRY_EXTRACT "ZN" => "zinc", }, - + packagingTypeCode => { - "AE" => "Aérosol", - "BA" => "Tonneau", - "BG" => "Sac", - "BK" => "Barquette", - "BO" => "Bouteille", - "BPG" => "Blister", - "BRI" => "Brique", - "BX" => "Boite", - "CNG" => "Canette", - "CR" => "Caisse", - "CT" => "Conteneur", - "CU" => "Pot", - "EN" => "Enveloppe", - "JR" => "Bocal", - "PO" => "Poche", - "PUG" => "Sac de transport", - "TU" => "Tube", - "WRP" => "Film", - }, - - packagingTypeCode_unused_not_taxonomized_yet => { "AE" => "en:aerosol", "BA" => "en:barrel", "BG" => "en:bag", @@ -333,8 +313,13 @@ my %unknown_entries_in_gs1_maps = (); # https://gs1.se/en/guides/documentation/code-lists/t3783-target-market-country-code/ targetMarketCountryCode => { + "040" => "en:austria", + "056" => "en:belgium", "250" => "en:france", "276" => "en:germany", + "380" => "en:italy", + "724" => "en:spain", + "756" => "en:switzerland", }, timeMeasurementUnitCode => { diff --git a/lib/ProductOpener/ImportConvert.pm b/lib/ProductOpener/ImportConvert.pm index 43cdfa1a905c6..9df089ae1b62d 100644 --- a/lib/ProductOpener/ImportConvert.pm +++ b/lib/ProductOpener/ImportConvert.pm @@ -44,15 +44,16 @@ convert the product data they contain to a format that can be imported on Open F package ProductOpener::ImportConvert; use ProductOpener::PerlStandards; -use Exporter qw< import >; +use Exporter qw< import >; use Log::Any qw($log); use Storable qw(dclone); use Text::Fuzzy; -BEGIN { - use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); +BEGIN +{ + use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); @EXPORT_OK = qw( %fields @@ -90,11 +91,11 @@ BEGIN { @xml_errors - ); # symbols to export on request + ); # symbols to export on request %EXPORT_TAGS = (all => [@EXPORT_OK]); } -use vars @EXPORT_OK; +use vars @EXPORT_OK ; use ProductOpener::Config qw/:all/; use ProductOpener::Store qw/:all/; @@ -125,7 +126,7 @@ my $mode = "append"; =cut -sub get_or_create_product_for_code ($code) { +sub get_or_create_product_for_code($code) { if (not defined $code) { die("Undefined code $code"); @@ -145,7 +146,7 @@ sub get_or_create_product_for_code ($code) { return $products{$code}; } -sub assign_value ($product_ref, $target, $value) { +sub assign_value($product_ref, $target, $value) { my $field = $target; @@ -176,11 +177,8 @@ sub assign_value ($product_ref, $target, $value) { $value =~ s/(\.|\,)(\d*[1-9])0+$/$1$2/; } - if ( (defined $product_ref->{$field}) - and ($product_ref->{$field} ne "") - and ($mode eq "append") - and ($product_ref->{$field} ne $value)) - { + if ((defined $product_ref->{$field}) and ($product_ref->{$field} ne "") and ($mode eq "append") + and ($product_ref->{$field} ne $value)) { if (exists $tags_fields{$field}) { if ($target =~ /^!/) { @@ -210,7 +208,8 @@ sub assign_value ($product_ref, $target, $value) { return; } -sub remove_value ($product_ref, $target, $value) { + +sub remove_value($product_ref, $target, $value) { my $field = $target; @@ -221,10 +220,12 @@ sub remove_value ($product_ref, $target, $value) { return; } -sub apply_global_params ($product_ref) { + +sub apply_global_params($product_ref) { $mode = "append"; + foreach my $field (sort keys %global_params) { assign_value($product_ref, $field, $global_params{$field}); @@ -235,15 +236,13 @@ sub apply_global_params ($product_ref) { # some producers send us data for products in different languages sold in different markets -sub assign_main_language_of_product ($product_ref, $lcs_ref, $default_lc) { +sub assign_main_language_of_product($product_ref, $lcs_ref, $default_lc) { if ((not defined $product_ref->{lc}) or (not defined $product_ref->{"product_name_" . $product_ref->{lc}})) { foreach my $possible_lc (@{$lcs_ref}) { - if ( (defined $product_ref->{"product_name_" . $possible_lc}) - and ($product_ref->{"product_name_" . $possible_lc} !~ /^\s*$/)) - { - $log->info("assign_main_language_of_product: assigning value", {lc => $possible_lc}) if $log->is_info(); + if ((defined $product_ref->{"product_name_" . $possible_lc}) and ($product_ref->{"product_name_" . $possible_lc} !~ /^\s*$/)) { + $log->info("assign_main_language_of_product: assigning value", { lc => $possible_lc}) if $log->is_info(); assign_value($product_ref, "lc", $possible_lc); last; } @@ -251,37 +250,34 @@ sub assign_main_language_of_product ($product_ref, $lcs_ref, $default_lc) { } if (not defined $product_ref->{lc}) { - $log->info("assign_main_language_of_product: assigning default value", {lc => $default_lc}) if $log->is_info(); + $log->info("assign_main_language_of_product: assigning default value", { lc => $default_lc}) if $log->is_info(); assign_value($product_ref, "lc", $default_lc); } return; } -sub assign_countries_for_product ($product_ref, $lcs_ref, $default_country) { +sub assign_countries_for_product($product_ref, $lcs_ref, $default_country) { foreach my $possible_lc (sort keys %{$lcs_ref}) { if (defined $product_ref->{"product_name_" . $possible_lc}) { - assign_value($product_ref, "countries", $lcs_ref->{$possible_lc}); - $log->info( - "assign_countries_for_product: found lc - assigning value", - {lc => $possible_lc, countries => $lcs_ref->{$possible_lc}} - ) if $log->is_info(); + assign_value($product_ref,"countries", $lcs_ref->{$possible_lc}); + $log->info("assign_countries_for_product: found lc - assigning value", { lc => $possible_lc, countries => $lcs_ref->{$possible_lc}}) if $log->is_info(); } } if ((not defined $product_ref->{countries}) or ($product_ref->{countries} eq "")) { - assign_value($product_ref, "countries", $default_country); - $log->info("assign_countries_for_product: assigning default value", {countries => $default_country}) - if $log->is_info(); + assign_value($product_ref,"countries", $default_country); + $log->info("assign_countries_for_product: assigning default value", { countries => $default_country}) if $log->is_info(); } return; } + # Match all tags that exist in a taxonomy. Needs the input field to be split, so there must be separators. -sub match_taxonomy_tags ($product_ref, $source, $target, $options_ref) { +sub match_taxonomy_tags($product_ref, $source, $target, $options_ref) { # logo ab # logo bio européen : nl-bio-01 agriculture pays bas 1 @@ -296,9 +292,7 @@ sub match_taxonomy_tags ($product_ref, $source, $target, $options_ref) { if ((defined $product_ref->{$source}) and ($product_ref->{$source} ne "")) { - $log->trace("match_taxonomy_tags: init", - {source => $source, value => $product_ref->{$source}, target => $target}) - if $log->is_trace(); + $log->trace("match_taxonomy_tags: init", { source => $source, value => $product_ref->{$source}, target => $target}) if $log->is_trace(); my @values = ($product_ref->{$source}); if ((defined $options_ref) and (defined $options_ref->{split}) and ($options_ref->{split} ne "")) { @@ -318,26 +312,20 @@ sub match_taxonomy_tags ($product_ref, $source, $target, $options_ref) { $value =~ s/\s+$//; my $canon_tag = canonicalize_taxonomy_tag($product_ref->{lc}, $target, $value); - $log->trace("match_taxonomy_tags: split value", {value => $value, canon_tag => $canon_tag}) - if $log->is_trace(); + $log->trace("match_taxonomy_tags: split value", { value => $value, canon_tag => $canon_tag}) if $log->is_trace(); + if (exists_taxonomy_tag($target, $canon_tag)) { assign_value($product_ref, $target, $canon_tag); - $log->info("match_taxonomy_tags: assigning value", - {source => $source, value => $canon_tag, target => $target}) - if $log->is_info(); + $log->info("match_taxonomy_tags: assigning value", { source => $source, value => $canon_tag, target => $target}) if $log->is_info(); } # try to see if we have a packager code # e.g. from Carrefour: Fabriqué en France par EMB 29181 (F) ou EMB 86092A (G) pour Interdis. elsif (($value =~ /^((e|emb)(\s|-|\.)*(\d{5})(\s|-|\.)*(\w)?)$/i) - or ($value =~ /([a-z][a-z])(\s|\.|-)+\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+(ce|ec|eg)/i)) - { - assign_value($product_ref, "emb_codes", $value); - $log->info( - "match_taxonomy_tags: found packaging code - assigning value", - {source => $source, value => $value, target => "emb_codes"} - ) if $log->is_info(); + or ($value =~ /([a-z][a-z])(\s|\.|-)+\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+(ce|ec|eg)/i)) { + assign_value($product_ref,"emb_codes", $value); + $log->info("match_taxonomy_tags: found packaging code - assigning value", { source => $source, value => $value, target => "emb_codes"}) if $log->is_info(); } } } @@ -345,28 +333,20 @@ sub match_taxonomy_tags ($product_ref, $source, $target, $options_ref) { return; } + # Match only specific tags (e.g. "organic" + "label rouge" in product name) -sub match_specific_taxonomy_tags ($product_ref, $source, $target, $tags_ref) { +sub match_specific_taxonomy_tags($product_ref, $source, $target, $tags_ref) { my $tag_lc = $product_ref->{lc}; - $log->trace( - "match_specific_taxonomy_tags - start", - { - source => $source, - source_value => $product_ref->{$source}, - target => $target, - tag_lc => $tag_lc, - tags_ref => $tags_ref - } - ) if $log->is_trace(); + $log->trace("match_specific_taxonomy_tags - start", { source => $source, source_value => $product_ref->{$source}, target => $target, tag_lc => $tag_lc, tags_ref => $tags_ref}) if $log->is_trace(); if ((defined $product_ref->{$source}) and ($product_ref->{$source} ne "")) { foreach my $tagid (@{$tags_ref}) { - $log->trace("match_specific_taxonomy_tags - looping through tags", {tagid => $tagid}) if $log->is_trace(); + $log->trace("match_specific_taxonomy_tags - looping through tags", { tagid => $tagid}) if $log->is_trace(); if (defined $translations_to{$target}{$tagid}{$tag_lc}) { @@ -381,11 +361,11 @@ sub match_specific_taxonomy_tags ($product_ref, $source, $target, $tags_ref) { } my $tag_regexp = ""; - foreach my $synonym (sort {length($b) <=> length($a)} @synonyms) { + foreach my $synonym (sort { length($b) <=> length($a) } @synonyms) { # simple singulars and plurals my $singular = $synonym; $synonym =~ s/s$//; - $tag_regexp .= '|' . $synonym . '|' . $synonym . 's'; + $tag_regexp .= '|' . $synonym . '|' . $synonym . 's' ; my $unaccented_synonym = unac_string_perl($synonym); if ($unaccented_synonym ne $synonym) { @@ -395,18 +375,16 @@ sub match_specific_taxonomy_tags ($product_ref, $source, $target, $tags_ref) { } $tag_regexp =~ s/^\|//; - $log->trace("match_specific_taxonomy_tags - regexp", {tag_regexp => $tag_regexp}) if $log->is_trace(); - $log->trace("match_specific_taxonomy_tags - source value", {source_value => $product_ref->{$source}}) - if $log->is_trace(); + $log->trace("match_specific_taxonomy_tags - regexp", { tag_regexp => $tag_regexp}) if $log->is_trace(); + $log->trace("match_specific_taxonomy_tags - source value", { source_value => $product_ref->{$source}}) if $log->is_trace(); if ($product_ref->{$source} =~ /\b(${tag_regexp})\b/i) { $log->info( "match_specific_taxonomy_tags: assigning value", - { - matching => $1, - source => $source, - value => $tagid, - target => $target + { matching => $1, + source => $source, + value => $tagid, + target => $target } ) if $log->is_info(); assign_value($product_ref, $target, $tagid); @@ -418,7 +396,7 @@ sub match_specific_taxonomy_tags ($product_ref, $source, $target, $tags_ref) { return; } -sub match_labels_in_product_name ($product_ref) { +sub match_labels_in_product_name($product_ref) { my $tag_lc = $product_ref->{lc}; @@ -434,12 +412,12 @@ sub match_labels_in_product_name ($product_ref) { return; } -sub split_allergens ($allergens) { + +sub split_allergens($allergens) { # simple allergen (not an enumeration) -> return _$allergens_ - if ( ($allergens !~ /,/) - and (not($allergens =~ / (et|and) /i))) - { + if (($allergens !~ /,/) + and (not ($allergens =~ / (et|and) /i))) { return "_" . $allergens . "_"; } else { @@ -447,6 +425,8 @@ sub split_allergens ($allergens) { } } + + =head2 assign_quantity_from_field ( $product_ref, $field ) Look for a quantity in a field like a product name. @@ -454,15 +434,11 @@ Assign it to the quantity and remove it from the field. =cut -sub assign_quantity_from_field ($product_ref, $field) { +sub assign_quantity_from_field($product_ref, $field) { - if ( (defined $product_ref->{$field}) - and ((not defined $product_ref->{quantity}) or ($product_ref->{quantity} eq ""))) - { + if ((defined $product_ref->{$field}) and ((not defined $product_ref->{quantity}) or ($product_ref->{quantity} eq ""))) { - if ($product_ref->{$field} - =~ /\b\(?((\d+)\s?x\s?)?(\d+\.?\,?\d*)\s?(g|gr|kg|kgr|l|cl|ml|dl)\s?(x\s?(\d+))?\)?\s*$/i) - { + if ($product_ref->{$field} =~ /\b\(?((\d+)\s?x\s?)?(\d+\.?\,?\d*)\s?(g|gr|kg|kgr|l|cl|ml|dl)\s?(x\s?(\d+))?\)?\s*$/i) { my $before = $`; @@ -470,7 +446,9 @@ sub assign_quantity_from_field ($product_ref, $field) { # e.g. Barres de Céréales (8+4) x 25g # if we have a single x or a * before, skip - if (not(($before =~ /(\sx|\*)\s*$/i))) { + if (not ( + ($before =~ /(\sx|\*)\s*$/i) + )) { $product_ref->{$field} = $before; @@ -493,6 +471,7 @@ sub assign_quantity_from_field ($product_ref, $field) { return; } + =head2 remove_quantity_from_field ( $product_ref, $field ) Look for the quantity in a field like a product name. @@ -500,14 +479,14 @@ If found, remove it from the field. =cut -sub remove_quantity_from_field ($product_ref, $field) { +sub remove_quantity_from_field($product_ref, $field) { if (defined $product_ref->{$field}) { - + my $quantity = $product_ref->{quantity}; my $quantity_value = $product_ref->{quantity_value}; my $quantity_unit = $product_ref->{quantity_unit}; - + if (defined $quantity) { $quantity =~ s/\(/\\\(/g; $quantity =~ s/\)/\\\)/g; @@ -517,23 +496,18 @@ sub remove_quantity_from_field ($product_ref, $field) { $product_ref->{$field} = $`; } } - elsif ( (defined $quantity_value) - and (defined $quantity_unit) - and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value $quantity_unit\)?\s*$/i)) - { + elsif ((defined $quantity_value) and (defined $quantity_unit) and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value $quantity_unit\)?\s*$/i)) { $product_ref->{$field} = $`; } - elsif ( (defined $quantity_value) - and (defined $quantity_unit) - and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value$quantity_unit\)?\s*$/i)) - { + elsif ((defined $quantity_value) and (defined $quantity_unit) and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value$quantity_unit\)?\s*$/i)) { $product_ref->{$field} = $`; - } + } } return; } -sub clean_weights ($product_ref) { + +sub clean_weights($product_ref) { # normalize weights @@ -559,9 +533,8 @@ sub clean_weights ($product_ref) { # we can be passed values in a specific unit (e.g. quantity_in_mg) if (not defined $product_ref->{$field}) { foreach my $u ('kg', 'g', 'mg', 'mcg', 'l', 'dl', 'cl', 'ml') { - if ( (defined $product_ref->{$field . "_value_in_" . $u}) - and ($product_ref->{$field . "_value_in_" . $u} ne "")) - { + if ((defined $product_ref->{$field . "_value_in_" . $u}) + and ($product_ref->{$field . "_value_in_" . $u} ne "")) { assign_value($product_ref, $field . "_value", $product_ref->{$field . "_value_in_" . $u}); assign_value($product_ref, $field . "_unit", $u); last; @@ -570,24 +543,20 @@ sub clean_weights ($product_ref) { } # if we have a value but no unit, assume the unit is grams for weights, if the value is greater than 20 and less than 5000 - if ( - (defined $product_ref->{$field . "_value"}) + if ((defined $product_ref->{$field . "_value"}) and ($product_ref->{$field . "_value"} ne "") - and ( (not defined $product_ref->{$field . "_unit"}) + and ((not defined $product_ref->{$field . "_unit"}) or ($product_ref->{$field . "_unit"} eq "")) and ($product_ref->{$field . "_value"} > 20) and ($product_ref->{$field . "_value"} < 2000) - and ($field =~ /weight/) - ) - { + and ($field =~ /weight/)) { assign_value($product_ref, $field . "_unit", "g"); } # We may be passed quantity_value_unit, in that case assign it to quantity - if ( (not defined $product_ref->{$field}) + if ((not defined $product_ref->{$field}) and (defined $product_ref->{$field . "_value_unit"}) - and ($product_ref->{$field . "_value_unit"} ne "")) - { + and ($product_ref->{$field . "_value_unit"} ne "")) { assign_value($product_ref, $field, $product_ref->{$field . "_value_unit"}); } @@ -597,38 +566,22 @@ sub clean_weights ($product_ref) { # - a value and a unit ("30 g") # in this case, we can combine them: "2 biscuits (30 g)" - if ( - (($field eq "quantity") or ($field eq "serving_size")) - and (defined $product_ref->{$field}) - and ($product_ref->{$field} ne "") - and (defined $product_ref->{$field . "_value"}) - and ($product_ref->{$field . "_value"} ne "") + if ((($field eq "quantity") or ($field eq "serving_size")) + and (defined $product_ref->{$field}) and ($product_ref->{$field} ne "") + and (defined $product_ref->{$field . "_value"}) and ($product_ref->{$field . "_value"} ne "") and (defined $product_ref->{$field . "_unit"}) # check we have not already combined the value and unit - and ( - not( - index($product_ref->{$field}, - $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}) >= 0 - ) - ) - ) - { - - assign_value($product_ref, $field, - $product_ref->{$field} . " (" - . $product_ref->{$field . "_value"} . " " - . $product_ref->{$field . "_unit"} - . ")"); + and (not (index($product_ref->{$field}, $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}) >= 0)) ) { + + assign_value($product_ref, $field, $product_ref->{$field} . " (" . $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"} . ")" ); } - elsif ( (not defined $product_ref->{$field}) + elsif ((not defined $product_ref->{$field}) and (defined $product_ref->{$field . "_value"}) and ($product_ref->{$field . "_value"} ne "") - and (defined $product_ref->{$field . "_unit"})) - { + and (defined $product_ref->{$field . "_unit"}) ) { - assign_value($product_ref, $field, - $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}); + assign_value($product_ref, $field, $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}); } if (defined $product_ref->{$field}) { @@ -676,41 +629,39 @@ sub clean_weights ($product_ref) { # poids net total : 200g [zemetro] poids net égoutté : 140g contenance 212ml my %regexps = ( - fr => { - net_weight => '(poids )?net( total)?', - drained_weight => '(poids )?(net )?(égoutté|egoutte)', - volume => '(volume|contenance)( net|nette)?( total)?', - }, +fr => { +net_weight => '(poids )?net( total)?', +drained_weight => '(poids )?(net )?(égoutté|egoutte)', +volume => '(volume|contenance)( net|nette)?( total)?', +}, - # Peso neto: 480 g (6 x 80 g) Peso neto escurrido: 336 g (6x56 g) +# Peso neto: 480 g (6 x 80 g) Peso neto escurrido: 336 g (6x56 g) - es => { - net_weight => '(peso )?neto( total)?', - drained_weight => '(peso )?(neto )?(escurrido)', - #volume => '(volume|contenance)( net|nette)?( total)?', - }, +es => { +net_weight => '(peso )?neto( total)?', +drained_weight => '(peso )?(neto )?(escurrido)', +#volume => '(volume|contenance)( net|nette)?( total)?', +}, ); if (defined $product_ref->{total_weight}) { - $log->debug("clean_weights", {lc => $product_ref->{lc}, total_weight => $product_ref->{total_weight}}) - if $log->is_debug(); + $log->debug("clean_weights", { lc => $product_ref->{lc}, total_weight => $product_ref->{total_weight} }) if $log->is_debug(); if ((defined $product_ref->{lc}) and (defined $regexps{$product_ref->{lc}})) { foreach my $field ("net_weight", "drained_weight", "volume") { - if ( (not defined $product_ref->{$field}) - and (defined $regexps{$product_ref->{lc}}{$field})) - { + if ((not defined $product_ref->{$field}) + and (defined $regexps{$product_ref->{lc}}{$field})) { my $regexp = $regexps{$product_ref->{lc}}{$field}; - if ($product_ref->{total_weight} =~ /$regexp/i) { + if ($product_ref->{total_weight} =~ /$regexp/i ) { my $after = $'; # match number with unit - $log->debug("clean_weights - matched", {field => $field, after => $after}) if $log->is_debug(); + $log->debug("clean_weights - matched", { field => $field, after => $after }) if $log->is_debug(); if ($after =~ /\s?:?\s?(\d[0-9\.\,]+\s*(\w+))/i) { assign_value($product_ref, $field, $1); @@ -741,31 +692,25 @@ sub clean_weights ($product_ref) { } } + my $normalized_quantity; if (defined $product_ref->{quantity}) { $normalized_quantity = normalize_quantity($product_ref->{quantity}); } # empty or incomplete quantity, but net_weight etc. present - if ( - (not defined $product_ref->{quantity}) - or ($product_ref->{quantity} eq "") - or (not defined $normalized_quantity) - or ( ($product_ref->{lc} eq "fr") - and ($product_ref->{quantity} =~ /^\d+ tranche([[:alpha:]]*)$/)) # French : "6 tranches épaisses" - or ($product_ref->{quantity} =~ /^\(.+\)$/) # (4 x 125 g) - ) - { + if ((not defined $product_ref->{quantity}) or ($product_ref->{quantity} eq "") or (not defined $normalized_quantity) + or (($product_ref->{lc} eq "fr") and ($product_ref->{quantity} =~ /^\d+ tranche([[:alpha:]]*)$/)) # French : "6 tranches épaisses" + or ($product_ref->{quantity} =~ /^\(.+\)$/) # (4 x 125 g) + ) { # See if we have other quantity related values: net_weight_value net_weight_unit drained_weight_value drained_weight_unit volume_value volume_unit my $extra_quantity; foreach my $field ("net_weight", "drained_weight", "total_weight", "volume") { - if ( (defined $product_ref->{$field}) - and ($product_ref->{$field} ne "") - and ($product_ref->{$field} =~ /^\d/)) - { # make sure we have a number + if ((defined $product_ref->{$field}) and ($product_ref->{$field} ne "") + and ($product_ref->{$field} =~ /^\d/) ) { # make sure we have a number $extra_quantity = $product_ref->{$field}; last; } @@ -789,6 +734,7 @@ sub clean_weights ($product_ref) { return; } + =head2 clean_fields ( $imported_product_ref ) This function: @@ -812,23 +758,32 @@ my %unspecified = ( 'unspecified', '(not|non)( |-|_)specified', 'not( |-|_)applicable', - 'na', 'n\/a', 'unknown', 'not( |-|_)known', + 'na', + 'n\/a', + 'unknown', + 'not( |-|_)known', + ], + 'es' => [ + 'no aplica', ], - 'es' => ['no aplica',], 'fr' => [ 'non( |-|_)(d(é|e)clar|indiqu|sp(é|e)cifi|renseign)(é|e)(e?)(s?)', 'ras|rien (à|a) signaler', - 'rien,n(é|e)ant', 'n\/r', 'nr', 'inconnu(e?)(s?)', 'non( |-|_)connu(e?)(s?)', + 'rien,n(é|e)ant', + 'n\/r', + 'nr', + 'inconnu(e?)(s?)', + 'non( |-|_)connu(e?)(s?)', ], ); -sub clean_fields ($product_ref) { +sub clean_fields($product_ref) { - $log->debug("clean_fields - start", {}) if $log->is_debug(); + $log->debug("clean_fields - start", { }) if $log->is_debug(); # Quantity in the product name? assign_quantity_from_field($product_ref, "product_name_" . $product_ref->{lc}); - + remove_quantity_from_field($product_ref, "product_name_" . $product_ref->{lc}); # Populate the quantity / weight fields from their quantity_value_unit, quantity_value, quantity_unit etc. components @@ -850,9 +805,13 @@ sub clean_fields ($product_ref) { foreach my $brand (split(/,/, $product_ref->{brands})) { $brand =~ s/^\s+//; $brand =~ s/\s+$//; + # we may get brands with quantifiers like * + ? etc. we need to escape them + $brand =~ s/(\*|\+|\?|\(|\)|\[|\]|\{|\}|\$|\^|\\)/\\/g; + # dashes/dots/spaces -> allow matching dashes/dot/spaces # e.g. "bons.mayennais" matches "bons mayennais" $brand =~ s/(\s|\.|-|_)/\(\\s|\\.|-|_\)/g; + $product_ref->{$field} =~ s/\s+$brand$//i; } } @@ -861,7 +820,7 @@ sub clean_fields ($product_ref) { foreach my $field (keys %{$product_ref}) { - $log->debug("clean_fields", {field => $field, value => $product_ref->{$field}}) if $log->is_debug(); + $log->debug("clean_fields", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); if (not defined $product_ref->{$field}) { print STDERR "undefined value for field $field\n"; @@ -903,18 +862,15 @@ sub clean_fields ($product_ref) { if (($product_ref->{$field} =~ /^(\s|-|\.|_)$/) and ($product_ref->{$field} ne '-')) { $product_ref->{$field} = ""; } - + # Remove "unspecified" values my @unspecified_lcs = ("en"); - if ( (defined $product_ref->{lc}) - and ($product_ref->{lc} ne 'en') - and (defined $unspecified{$product_ref->{lc}})) - { + if ((defined $product_ref->{lc}) and ($product_ref->{lc} ne 'en') and (defined $unspecified{$product_ref->{lc}})) { push @unspecified_lcs, $product_ref->{lc}; } - + foreach my $l (@unspecified_lcs) { - + foreach my $regexp (@{$unspecified{$l}}) { if ($product_ref->{$field} =~ /^\s*($regexp)\s*$/i) { if (defined $tags_fields{$field}) { @@ -936,15 +892,15 @@ sub clean_fields ($product_ref) { # FR 62.907.030 EC (DANS UN OVALE) $product_ref->{$field} =~ s/\(?dans un ovale\)?//ig; - + # FR 72.024.001 EC - FR 72 024 520 CE $product_ref->{$field} =~ s/ (CE|EC) - ([A-Z][A-Z]) / $1, $2/g; } - + # category with "organic" in it if ($field eq "categories") { - $product_ref->{$field} =~ s/^organic //i; # English - $product_ref->{$field} =~ s/ bio$//i; # French + $product_ref->{$field} =~ s/^organic //i; # English + $product_ref->{$field} =~ s/ bio$//i; # French } # Origin of ingredients that contains other things than tags (e.g. Leroux) @@ -967,21 +923,20 @@ sub clean_fields ($product_ref) { } if ($field =~ /^(ingredients_text|product_name|abbreviated_product_name|generic_name|brands)/) { - + # Lowercase fields in ALL CAPS # Capitalize all lowercase fields - + # do not count x4 as a lowercase letter # e.g. KINDER COUNTRY BARRE DE CEREALES ENROBEE DE CHOCOLAT 2x9 BARRES - + my $value = $product_ref->{$field}; $value =~ s/x(\d)/X$1/; $value =~ s/(\d)x/$1X/; - - if ( (($value =~ /[A-Z]{4}/) and ($value !~ /[a-z]/)) - or (($value =~ /[a-z]{4}/) and ($value !~ /[A-Z]/))) - { - + + if ((($value =~ /[A-Z]{4}/) and ($value !~ /[a-z]/)) + or (($value =~ /[a-z]{4}/) and ($value !~ /[A-Z]/)) ) { + # Tag field: uppercase the first letter (e.g. brands) if (defined $tags_fields{$field}) { $product_ref->{$field} = join(", ", map {ucfirst} split /, |,/, lc($product_ref->{$field})); @@ -989,33 +944,32 @@ sub clean_fields ($product_ref) { else { $product_ref->{$field} = ucfirst(lc($product_ref->{$field})); } - $log->debug("clean_fields - after lowercase", {field => $field, value => $product_ref->{$field}}) - if $log->is_debug(); + $log->debug("clean_fields - after lowercase", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); } - + # Remove fields with "0" if ($product_ref->{$field} ne '-') { $product_ref->{$field} =~ s/^( |0|-|_|\.|\/|\*|;)+$//; } - + # Remove HTML comments $product_ref->{$field} =~ s///sg; - + # if there's some HTML code, making special cases to try to repair is probably more dangerous than just ignoring the fields # e.g. )/) { $product_ref->{$field} = ""; } - + } - + # All fields # Ingredients if ($field =~ /^ingredients_text/) { - + # _x000D_ $product_ref->{$field} =~ s/_x000D_/\n/g; @@ -1043,38 +997,35 @@ sub clean_fields ($product_ref) { # extrait de malt d'orge - sel $product_ref->{$field} =~ s/ -( |)<\/b>/<\/b> -$1/ig; - $log->debug("clean_fields - ingredients_text - 1", {field => $field, value => $product_ref->{$field}}) - if $log->is_debug(); + $log->debug("clean_fields - ingredients_text - 1", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + $product_ref->{$field} =~ s/(.*?)<\/b>/split_allergens($1)/iesg; $product_ref->{$field} =~ s/|<\/b>//ig; - $log->debug("clean_fields - ingredients_text - 2", {field => $field, value => $product_ref->{$field}}) - if $log->is_debug(); - + $log->debug("clean_fields - ingredients_text - 2", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + # Ingredients without separators # e.g. found in some CodeOnline data: "Ingrédients : Pur cacao de MadagascarŒufs fraisHuiles végétalesGélifiant végétalSucre" - + if ($product_ref->{$field} !~ /,|;| - /) { $product_ref->{$field} =~ s/(\p{Lower}\p{Lower}+)(?=\p{Upper}\p{Lower}\p{Lower})/$1, /g; } + if ($field eq "ingredients_text_fr") { # remove single sentence that say allergens are in bold (in Casino data) - $product_ref->{$field} - =~ s/(Les |l')?(information|ingrédient|indication)(s?) ([^\.,]*) (personnes )?((allergiques( (ou|et) intolérant(e|)s)?)|(intolérant(e|)s( (ou|et) allergiques)?))(\.)?//i; + $product_ref->{$field} =~ s/(Les |l')?(information|ingrédient|indication)(s?) ([^\.,]*) (personnes )?((allergiques( (ou|et) intolérant(e|)s)?)|(intolérant(e|)s( (ou|et) allergiques)?))(\.)?//i; $product_ref->{$field} = ucfirst($product_ref->{$field}); - $log->debug("clean_fields - ingredients_text - 3", {field => $field, value => $product_ref->{$field}}) - if $log->is_debug(); + $log->debug("clean_fields - ingredients_text - 3", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); # Missing spaces # Poire Williams - sucre de canne - sucre - gélifiant : pectines de fruits - acidifiant : acide citrique.Préparée avec 55 g de fruits pour 100 g de produit fini.Teneur totale en sucres 56 g pour 100 g de produit fini.Traces de _fruits à coque_ et de _lait_.. $product_ref->{$field} =~ s/\.([A-Z][a-z])/\. $1/g; - $log->debug("clean_fields - ingredients_text - 4", {field => $field, value => $product_ref->{$field}}) - if $log->is_debug(); + $log->debug("clean_fields - ingredients_text - 4", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); } @@ -1087,31 +1038,23 @@ sub clean_fields ($product_ref) { # _d'arachide_ # morceaux _d’amandes_ grillées - if ( ($field =~ /_fr/) - or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) - { + if (($field =~ /_fr/) or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) { $product_ref->{$field} =~ s/_(d|l)('|’)([^_,-;]+)_/$1'_$2_/ig; } } if ($field =~ /^ingredients_text_(\w\w)/) { my $ingredients_lc = $1; - $log->debug( - "clean_fields - before clean_ingredients_text_for_lang ", - {field => $field, value => $product_ref->{$field}} - ) if $log->is_debug(); + $log->debug("clean_fields - before clean_ingredients_text_for_lang ", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); $product_ref->{$field} = clean_ingredients_text_for_lang($product_ref->{$field}, $ingredients_lc); - $log->debug( - "clean_fields - after clean_ingredients_text_for_lang ", - {field => $field, value => $product_ref->{$field}} - ) if $log->is_debug(); + $log->debug("clean_fields - after clean_ingredients_text_for_lang ", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); } if ($field =~ /^nutriscore_grade_/) { $product_ref->{$field} = lc($product_ref->{$field}); } - + if ($field eq "nutriscore_grade_producer") { # Nutriscore_A -> a $product_ref->{$field} =~ s/(nutri-score|nutriscore)(\s|:|-|_|\.)+([a-e])/$3/i; @@ -1119,33 +1062,30 @@ sub clean_fields ($product_ref) { # remove N, N/A, NA etc. # but not "no", "none" that are useful values (e.g. for specific labels "organic:no", allergens : "none") - $product_ref->{$field} - =~ s/(^|,)\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseigné|non applicable|no aplica|nr|n\/r)\s*(,|$)//ig; - + $product_ref->{$field} =~ s/(^|,)\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseigné|non applicable|no aplica|nr|n\/r)\s*(,|$)//ig; + # remove none except for allergens and traces if ($field !~ /allergens|traces/) { - $product_ref->{$field} =~ s/(^|,)\s*(none|aucun|aucune|aucun\(e\))\s*(,|$)//ig; + $product_ref->{$field} =~ s/(^|,)\s*(none|aucun|aucune|aucun\(e\))\s*(,|$)//ig; } - if ( ($field =~ /_fr/) - or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) - { + if (($field =~ /_fr/) or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) { $product_ref->{$field} =~ s/^\s*(autre logo)?\s*$//ig; } $product_ref->{$field} =~ s/ +/ /g; $product_ref->{$field} =~ s/,(\s*),/,/g; $product_ref->{$field} =~ s/\.(\.+)$/\./; - + # Don't remove a single dash -, it is used to indicate an existing value should be deleted if ($product_ref->{$field} ne '-') { - + # Remove trailing dashes and commas $product_ref->{$field} =~ s/(\s|-|;|,)*$//; # Remove leading dashes, commas and dots # be careful not to turn -5 to 5: remove dashes only if they are not followed by a number $product_ref->{$field} =~ s/^(\s|-(?![0-9])|;|,|\.)+//; - + # Remove entries made entirely of punctuation characters, or x or X $product_ref->{$field} =~ s/^(x|X|,|;|-|_|\/|\\|#|:|\.|\s)+$//; } @@ -1167,6 +1107,7 @@ sub clean_fields ($product_ref) { return; } + sub clean_fields_for_all_products() { foreach my $code (sort keys %products) { @@ -1176,13 +1117,14 @@ sub clean_fields_for_all_products() { return; } -sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { + +sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { # $code can be undef or passed if we already know it from the file name # try to guess the code from the file name if ((not defined $code) and ($file =~ /\D(\d{13})\D/)) { $code = $1; - $log->info("inferring code from file name", {code => $code, file => $file}) if $log->is_info(); + $log->info("inferring code from file name", { code => $code, file => $file }) if $log->is_info(); } @@ -1190,21 +1132,21 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { $code = normalize_code($code); } - $log->info("parsing xml file with XML::Rules", {file => $file, xml_rules => $xml_rules_ref}) if $log->is_info(); + $log->info("parsing xml file with XML::Rules", { file => $file, xml_rules => $xml_rules_ref }) if $log->is_info(); my $parser = XML::Rules->new(rules => $xml_rules_ref); my $xml_ref; - eval {$xml_ref = $parser->parse_file($file);}; + eval { $xml_ref = $parser->parse_file($file); }; if ($@ ne "") { - $log->error("error parsing xml file with XML::Rules", {file => $file, error => $@}) if $log->is_error(); + $log->error("error parsing xml file with XML::Rules", { file => $file, error=>$@ }) if $log->is_error(); push @xml_errors, $file; #exit; } - $log->trace("XML::Rules output", {file => $file, xml_ref => $xml_ref}) if $log->is_trace(); + $log->trace("XML::Rules output", { file => $file, xml_ref => $xml_ref }) if $log->is_trace(); # Skip empty XML files if (not defined $xml_ref) { @@ -1213,36 +1155,36 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { if ($log->is_trace()) { binmode STDOUT, ":encoding(UTF-8)"; - open(my $OUT_JSON, ">", "$www_root/data/import_debug_xml.json"); + open (my $OUT_JSON, ">", "$www_root/data/import_debug_xml.json"); print $OUT_JSON encode_json($xml_ref); - close($OUT_JSON); + close ($OUT_JSON); } # Some producers (e.g. Auchan) have multiple product codes in one file, with multiple label field values, # but without an actual id to make the mapping. - # - #- - # - # - # - # - # - # - # - # ... - #+ - #- - #Crème Dessert Chocolat Caramel Auchan x4 - #Chocolat Caramel x 4 - #Crème dessert Caram'choc - #Crème dessert aromatisée caramel chocolat - - # multiple_codes => { - # codes => codes, # all sub fields will be moved to the root of the split children - # fuzzy_match => "etiquettes", # if exists, specify a field that depends on the child - # fuzzy_from => "DenominationCommerciale", # value from "codes" that will be fuzzy matched to find the id for "fuzzy_match" hash - # }, +# +#- +# +# +# +# +# +# +# +# ... +#+ +#- +#Crème Dessert Chocolat Caramel Auchan x4 +#Chocolat Caramel x 4 +#Crème dessert Caram'choc +#Crème dessert aromatisée caramel chocolat + +# multiple_codes => { +# codes => codes, # all sub fields will be moved to the root of the split children +# fuzzy_match => "etiquettes", # if exists, specify a field that depends on the child +# fuzzy_from => "DenominationCommerciale", # value from "codes" that will be fuzzy matched to find the id for "fuzzy_match" hash +# }, my @xml_refs = (); @@ -1252,7 +1194,7 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { my $array = $xml_fields_mapping_ref->[0][1]; - $log->info("Split multiple products", {file => $file, array => $array}) if $log->is_info(); + $log->info("Split multiple products", { file => $file, array => $array }) if $log->is_info(); if (defined $xml_ref->{$array}) { my $i = 1; @@ -1268,10 +1210,11 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { } + # Multiple variant of one product, with different codes? elsif ($xml_fields_mapping_ref->[0][0] eq "multiple_codes") { - $log->info("Split multiple codes (product variants)", {file => $file}) if $log->is_info(); + $log->info("Split multiple codes (product variants)", { file => $file }) if $log->is_info(); my $codes = $xml_fields_mapping_ref->[0][1]{codes}; @@ -1286,7 +1229,7 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { $fuzzy_match = $xml_fields_mapping_ref->[0][1]{fuzzy_match}; if (defined $xml_ref->{$fuzzy_match}) { @fuzzy_match_keys = sort keys %{$xml_ref->{$fuzzy_match}}; - @fuzzy_match_keysid = map {get_string_id_for_lang("no_language", $_)} @fuzzy_match_keys; + @fuzzy_match_keysid = map { get_string_id_for_lang("no_language", $_) } @fuzzy_match_keys; } } @@ -1295,15 +1238,14 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { $new_code = normalize_code($new_code); - $log->info("Split multiple products - code", {code => $new_code}) if $log->is_info(); + $log->info("Split multiple products - code", { code => $new_code }) if $log->is_info(); my $new_xml_ref = dclone($xml_ref); $new_xml_ref->{code} = $new_code; foreach my $field (sort keys %{$xml_ref->{$codes}{$new_code}}) { - $log->info("Split multiple products - copy field", {code => $new_code, field => $field}) - if $log->is_info(); + $log->info("Split multiple products - copy field", { code => $new_code, field => $field }) if $log->is_info(); $new_xml_ref->{$field} = $xml_ref->{$codes}{$new_code}{$field}; } @@ -1313,19 +1255,17 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { my $fuzzy_from = $xml_fields_mapping_ref->[0][1]{fuzzy_from}; - $log->info("Fuzzy match", {fuzzy_from => $fuzzy_from}) if $log->is_info(); + $log->info("Fuzzy match", { fuzzy_from => $fuzzy_from }) if $log->is_info(); if (defined $new_xml_ref->{$fuzzy_from}) { - my $tf = Text::Fuzzy->new(get_string_id_for_lang("no_language", $new_xml_ref->{$fuzzy_from})); - my $nearestid = $tf->nearest(\@fuzzy_match_keysid); + my $tf = Text::Fuzzy->new (get_string_id_for_lang("no_language", $new_xml_ref->{$fuzzy_from})); + my $nearestid = $tf->nearest (\@fuzzy_match_keysid); my $nearest = $fuzzy_match_keys[$nearestid]; - $log->info("Fuzzy match found", - {fuzzy_from => $fuzzy_from, value => $new_xml_ref->{$fuzzy_from}, nearest => $nearest}) - if $log->is_info(); + $log->info("Fuzzy match found", { fuzzy_from => $fuzzy_from, value => $new_xml_ref->{$fuzzy_from}, nearest => $nearest }) if $log->is_info(); foreach my $field (sort keys %{$xml_ref->{$fuzzy_match}{$nearest}}) { - $log->info("Fuzzy match - copy field", {field => $field}) if $log->is_info(); + $log->info("Fuzzy match - copy field", { field => $field }) if $log->is_info(); $new_xml_ref->{$field} = $xml_ref->{$fuzzy_match}{$nearest}{$field}; } @@ -1345,15 +1285,15 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { push @xml_refs, $xml_ref; } - $log->info("Mapping XML fields", {file => $file}) if $log->is_info(); + $log->info("Mapping XML fields", { file => $file }) if $log->is_info(); - # my @xml_fields_mapping = ( - # - # # get the code first - # - # ["fields.AL_CODE_EAN.FR", "code"], - # ["ProductCode", "producer_version_id"], - # ["fields.AL_INGREDIENT.*", "ingredients_text_*"], +# my @xml_fields_mapping = ( +# +# # get the code first +# +# ["fields.AL_CODE_EAN.FR", "code"], +# ["ProductCode", "producer_version_id"], +# ["fields.AL_INGREDIENT.*", "ingredients_text_*"], # $code = undef; @@ -1361,161 +1301,149 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { foreach my $xml_ref (@xml_refs) { - my $product_ref; + my $product_ref; - if (defined $code) { - $product_ref = get_or_create_product_for_code($code); - } + if (defined $code) { + $product_ref = get_or_create_product_for_code($code); + } - foreach my $field_mapping_ref (@{$xml_fields_mapping_ref}) { - my $source = $field_mapping_ref->[0]; - my $target = $field_mapping_ref->[1]; + foreach my $field_mapping_ref (@{$xml_fields_mapping_ref}) { + my $source = $field_mapping_ref->[0]; + my $target = $field_mapping_ref->[1]; - $log->trace("source $i", {source => $source, target => $target}) if $log->is_trace(); + $log->trace("source $i", { source=>$source, target=>$target }) if $log->is_trace(); - my $current_tag = $xml_ref; + my $current_tag = $xml_ref; - print STDERR "\nsource: $source\n"; + print STDERR "\nsource: $source\n"; - foreach my $source_tag (split(/\./, $source)) { - print STDERR "source_tag: $source_tag\n"; + foreach my $source_tag (split(/\./, $source)) { + print STDERR "source_tag: $source_tag\n"; - # commands + # commands - # ["[delete_except]", "producer|emb_codes|origin"], + # ["[delete_except]", "producer|emb_codes|origin"], - if ($source_tag eq '[delete_except]') { - my $regexp = $target; - foreach my $field (sort keys %{$product_ref}) { - next if $field eq 'code'; - next if $field =~ /$regexp/i; - $log->trace("deleting existing field", {field => $field}) if $log->is_trace(); - delete $product_ref->{$field}; - } + if ($source_tag eq '[delete_except]') { + my $regexp = $target; + foreach my $field ( sort keys %{$product_ref}) { + next if $field eq 'code'; + next if $field =~ /$regexp/i; + $log->trace("deleting existing field", { field=>$field }) if $log->is_trace(); + delete $product_ref->{$field}; } + } - # multiple values in different languages + # multiple values in different languages - elsif ($source_tag eq '*') { - foreach my $tag (sort keys %{$current_tag}) { - my $tag_target = $target; + elsif ($source_tag eq '*') { + foreach my $tag ( sort keys %{$current_tag}) { + my $tag_target = $target; - # special case where we have something like allergens.nuts = traces - if ($tag_target eq "value_as_target_and_source_as_value") { - print STDERR "* tag key: $tag - target: $tag_target\n"; - if ( (defined $current_tag->{$tag}) - and (not ref($current_tag->{$tag})) - and ($current_tag->{$tag} ne '')) - { - print STDERR "assign $tag to $current_tag->{$tag}\n"; + # special case where we have something like allergens.nuts = traces + if ($tag_target eq "value_as_target_and_source_as_value") { + print STDERR "* tag key: $tag - target: $tag_target\n"; + if ((defined $current_tag->{$tag}) and (not ref($current_tag->{$tag})) and ($current_tag->{$tag} ne '')) { + print STDERR "assign $tag to $current_tag->{$tag}\n"; - assign_value($product_ref, $current_tag->{$tag}, $tag); - } + assign_value($product_ref, $current_tag->{$tag}, $tag); } - else { + } + else { - $tag_target =~ s/\*/$tag/; - $tag_target = lc($tag_target); - print STDERR "* tag key: $tag - target: $tag_target\n"; - if ( (defined $current_tag->{$tag}) - and (not ref($current_tag->{$tag})) - and ($current_tag->{$tag} ne '')) - { - print STDERR - "$tag value is a scalar: $current_tag->{$tag}, assign value to $tag_target\n"; - if ($tag_target eq 'code') { - $code = $current_tag->{$tag}; - - $code = normalize_code($code); - $product_ref = get_or_create_product_for_code($code); - } - assign_value($product_ref, $tag_target, $current_tag->{$tag}); + $tag_target =~ s/\*/$tag/; + $tag_target = lc($tag_target); + print STDERR "* tag key: $tag - target: $tag_target\n"; + if ((defined $current_tag->{$tag}) and (not ref($current_tag->{$tag})) and ($current_tag->{$tag} ne '')) { + print STDERR "$tag value is a scalar: $current_tag->{$tag}, assign value to $tag_target\n"; + if ($tag_target eq 'code') { + $code = $current_tag->{$tag}; - if ($tag_target eq 'emb_codes') { - print STDERR "emb_codes : " . $product_ref->{$tag_target} . "\n"; - } + $code = normalize_code($code); + $product_ref = get_or_create_product_for_code($code); + } + assign_value($product_ref, $tag_target, $current_tag->{$tag}); + + if ($tag_target eq 'emb_codes') { + print STDERR "emb_codes : " . $product_ref->{$tag_target} . "\n"; } } } - last; } + last; + } - # Array - e.g. ["nutrients.ENERKJ.[0].RoundValue", "nutriments.energy_kJ"], + # Array - e.g. ["nutrients.ENERKJ.[0].RoundValue", "nutriments.energy_kJ"], - elsif ($source_tag =~ /^\[(\d+)\]$/) { - my $i = $1; - if ((ref($current_tag) eq 'ARRAY') and (defined $current_tag->[$i])) { - print STDERR "going down to array element $source_tag - $i\n"; - $current_tag = $current_tag->[$i]; - } + elsif ($source_tag =~ /^\[(\d+)\]$/) { + my $i = $1; + if ((ref($current_tag) eq 'ARRAY') and (defined $current_tag->[$i])) { + print STDERR "going down to array element $source_tag - $i\n"; + $current_tag = $current_tag->[$i]; } + } - # Array with several versions identified by a number, take the highest one - # {$version}) and (not defined $max) - or ($version_ref->{$version} > $max)) - { - $max = $version_ref->{$version}; - $max_version_ref = $version_ref; - } - } - if (defined $max_version_ref) { - print STDERR "going down to array element $source_tag - version $max\n"; - $current_tag = $max_version_ref; + # Array with several versions identified by a number, take the highest one + # {$version}) and (not defined $max) or ($version_ref->{$version} > $max)) { + $max = $version_ref->{$version}; + $max_version_ref = $version_ref; } } + if (defined $max_version_ref) { + print STDERR "going down to array element $source_tag - version $max\n"; + $current_tag = $max_version_ref; + } } + } - elsif (defined $current_tag->{$source_tag}) { - if ((ref($current_tag->{$source_tag}) eq 'HASH') or (ref($current_tag->{$source_tag}) eq 'ARRAY')) { - print STDERR "going down to hash $source_tag\n"; - $current_tag = $current_tag->{$source_tag}; - } - elsif ( (defined $current_tag->{$source_tag}) - and (not ref($current_tag->{$source_tag})) - and ($current_tag->{$source_tag} ne '')) - { + elsif (defined $current_tag->{$source_tag}) { + if ((ref($current_tag->{$source_tag}) eq 'HASH') or (ref($current_tag->{$source_tag}) eq 'ARRAY')) { + print STDERR "going down to hash $source_tag\n"; + $current_tag = $current_tag->{$source_tag}; + } + elsif ((defined $current_tag->{$source_tag}) and (not ref($current_tag->{$source_tag})) and ($current_tag->{$source_tag} ne '')) { - my $value = $current_tag->{$source_tag}; + my $value = $current_tag->{$source_tag}; - print STDERR "$source_tag is a scalar: $value, assign value to $target\n"; - if ($target eq 'code') { - $code = $value; - $code = normalize_code($code); - $product_ref = get_or_create_product_for_code($code); - } + print STDERR "$source_tag is a scalar: $value, assign value to $target\n"; + if ($target eq 'code') { + $code = $value; + $code = normalize_code($code); + $product_ref = get_or_create_product_for_code($code); + } - my $seen_energy_kj = 0; + my $seen_energy_kj = 0; - if ($target =~ /^nutriments.(.*)/) { - $target = $1; + if ($target =~ /^nutriments.(.*)/) { + $target = $1; - # skip energy in kcal if we already have energy in kJ - if (($seen_energy_kj) and ($target =~ /kcal/i)) { - next; - } + # skip energy in kcal if we already have energy in kJ + if (($seen_energy_kj) and ($target =~ /kcal/i)) { + next; + } - if ($target =~ /kj/i) { - $seen_energy_kj = 1; - } + if ($target =~ /kj/i) { + $seen_energy_kj = 1; + } - $value =~ s/,/\./; + $value =~ s/,/\./; - if ($target =~ /^(.*)_value$/) { - assign_value($product_ref, $target, $value); - } - elsif ($target =~ /^(.*)_unit$/) { - assign_value($product_ref, $target, $value); - } - elsif ($target =~ /^(.*)_([^_]+)$/) { + if ($target =~ /^(.*)_value$/) { + assign_value($product_ref, $target, $value); + } + elsif ($target =~ /^(.*)_unit$/) { + assign_value($product_ref, $target, $value); + } + elsif ($target =~ /^(.*)_([^_]+)$/) { $target = $1; my $unit = $2; assign_value($product_ref, $target . "_value", $value); @@ -1525,30 +1453,31 @@ sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { else { assign_value($product_ref, $target . "_unit", ""); } - } - else { - assign_value($product_ref, $target . "_value", $value); - } } else { - assign_value($product_ref, $target, $value); + assign_value($product_ref, $target . "_value", $value); } } - } - else { - last; + else { + assign_value($product_ref, $target, $value); + } } } - - $i++; + else { + last; + } } - } #foreach @xml_refs + $i++; + } + + } #foreach @xml_refs return 0; } -sub load_csv_file ($options_ref) { + +sub load_csv_file($options_ref) { my $file = $options_ref->{file}; my $encoding = $options_ref->{encoding}; @@ -1561,37 +1490,37 @@ sub load_csv_file ($options_ref) { # e.g. load_csv_file($file, "UTF-8", "\t", 4); - $log->info("Loading CSV file", {file => $file}) if $log->is_info(); + $log->info("Loading CSV file", { file => $file }) if $log->is_info(); - my $csv_options_ref = {binary => 1, sep_char => $separator}; + my $csv_options_ref = { binary => 1 , sep_char => $separator }; if (defined $options_ref->{escape_char}) { $csv_options_ref->{escape_char} = $options_ref->{escape_char}; } - my $csv = Text::CSV->new($csv_options_ref) # should set binary attribute. - or die "Cannot use CSV: " . Text::CSV->error_diag(); + my $csv = Text::CSV->new ( $csv_options_ref ) # should set binary attribute. + or die "Cannot use CSV: " . Text::CSV->error_diag (); - open(my $io, "<:encoding($encoding)", $file) or die("Could not open $file: $!"); + open (my $io, "<:encoding($encoding)", $file) or die("Could not open $file: $!"); my $i = 0; # line number if (defined $skip_lines) { $log->info("Skipping $skip_lines lines before header") if $log->is_info(); for ($i = 0; $i < $skip_lines; $i++) { - $csv->getline($io); + $csv->getline ($io); } } #my $headers_ref = $csv->getline ($io); $i++; - $csv->header($io, {detect_bom => 1}); + $csv->header ($io, { detect_bom => 1 }); if (defined $skip_lines_after_header) { $log->info("Skipping $skip_lines_after_header lines after header") if $log->is_info(); for (my $j = 0; $j < $skip_lines_after_header; $j++) { - $csv->getline($io); + $csv->getline ($io); $i++; } } @@ -1602,9 +1531,9 @@ sub load_csv_file ($options_ref) { my $product_ref; - while (my $csv_product_ref = $csv->getline_hr($io)) { + while (my $csv_product_ref = $csv->getline_hr ($io)) { - $i++; # line number + $i++; # line number $log->info("Reading line $i") if $log->is_info(); @@ -1630,9 +1559,8 @@ sub load_csv_file ($options_ref) { my $source_condition_field = $field_mapping_ref->[$condition][0]; my $source_condition_value = $field_mapping_ref->[$condition][1]; - if ( (not defined $csv_product_ref->{$source_condition_field}) - or ($csv_product_ref->{$source_condition_field} ne $source_condition_value)) - { + if ((not defined $csv_product_ref->{$source_condition_field}) + or ($csv_product_ref->{$source_condition_field} ne $source_condition_value)) { $match = 0; } @@ -1655,10 +1583,7 @@ sub load_csv_file ($options_ref) { print STDERR "skipping invalid code\n"; last; } - elsif ( (defined $skip_non_existing_products) - and ($skip_non_existing_products) - and (not exists $products{$code})) - { + elsif ((defined $skip_non_existing_products) and ($skip_non_existing_products) and (not exists $products{$code})) { print STDERR "skipping non existing product\n"; last; } @@ -1680,7 +1605,7 @@ sub load_csv_file ($options_ref) { my $dir = $'; $dir =~ s/\/$//; - my $file = $csv_product_ref->{$source_field}; + my $file = $csv_product_ref->{$source_field}; $file =~ s/.*\///; $file =~ s/[^A-Za-z0-9-_\.]/_/g; @@ -1691,7 +1616,7 @@ sub load_csv_file ($options_ref) { # do not download again images that we already have # but try again if the size is 0 - if ((!-e "$dir/$file") or ((-s "$dir/$file") < 10000)) { + if ((! -e "$dir/$file") or ((-s "$dir/$file") < 10000)) { print STDERR "downloading image: wget $csv_product_ref->{$source_field} -O $dir/$file\n"; system("wget \"" . $csv_product_ref->{$source_field} . "\" -O $dir/$file"); @@ -1714,15 +1639,15 @@ sub load_csv_file ($options_ref) { } if ($target_field =~ /^(.*)_([^_]+)$/) { - $target_field = $1; - my $unit = $2; - assign_value($product_ref, $target_field . "_value", $value); - if ($value ne "") { - assign_value($product_ref, $target_field . "_unit", $unit); - } - else { - assign_value($product_ref, $target_field . "_unit", ""); - } + $target_field = $1; + my $unit = $2; + assign_value($product_ref, $target_field . "_value", $value); + if ($value ne "") { + assign_value($product_ref, $target_field . "_unit", $unit); + } + else { + assign_value($product_ref, $target_field . "_unit", ""); + } } else { assign_value($product_ref, $target_field . "_value", $value); @@ -1743,9 +1668,7 @@ sub load_csv_file ($options_ref) { } } else { - $log->error("undefined source field", - {line => $i, source_field => $source_field, csv_product_ref => $csv_product_ref}) - if $log->is_error(); + $log->error("undefined source field", { line => $i, source_field=>$source_field, csv_product_ref=>$csv_product_ref }) if $log->is_error(); die; } } @@ -1755,7 +1678,7 @@ sub load_csv_file ($options_ref) { return; } -sub recursive_list ($list_ref, $arg) { +sub recursive_list($list_ref, $arg) { if (-d $arg) { @@ -1763,8 +1686,8 @@ sub recursive_list ($list_ref, $arg) { print STDERR "Opening dir $dir\n"; - if (opendir(DH, "$dir")) { - foreach my $file (sort {$a cmp $b} readdir(DH)) { + if (opendir (DH, "$dir")) { + foreach my $file (sort { $a cmp $b } readdir(DH)) { next if (($file eq '.') or ($file eq '..')); @@ -1772,7 +1695,7 @@ sub recursive_list ($list_ref, $arg) { } } - closedir(DH); + closedir (DH); } else { push @{$list_ref}, $arg; @@ -1781,7 +1704,7 @@ sub recursive_list ($list_ref, $arg) { return; } -sub get_list_of_files (@files_and_dirs) { +sub get_list_of_files(@files_and_dirs) { # Read the list of files or directories passed as parameters @@ -1797,13 +1720,14 @@ sub get_list_of_files (@files_and_dirs) { return @files; } -sub print_csv_file ($file_handle) { - my $csv_out - = Text::CSV->new({binary => 1, sep_char => "\t", eol => "\n", quote_space => 0}) # should set binary attribute. - or die "Cannot use CSV: " . Text::CSV->error_diag(); - $csv_out->print($file_handle, \@fields); +sub print_csv_file($file_handle) { + + my $csv_out = Text::CSV->new ( { binary => 1 , sep_char => "\t", eol => "\n", quote_space => 0 } ) # should set binary attribute. + or die "Cannot use CSV: ".Text::CSV->error_diag (); + + $csv_out->print ($file_handle, \@fields) ; foreach my $code (sort keys %products) { @@ -1819,7 +1743,7 @@ sub print_csv_file ($file_handle) { } } - $csv_out->print($file_handle, \@values); + $csv_out->print ($file_handle, \@values) ; print STDERR "code: $code\n"; } @@ -1827,6 +1751,7 @@ sub print_csv_file ($file_handle) { return; } + sub print_stats() { my %existing_values = (); @@ -1855,6 +1780,7 @@ sub print_stats() { return; } + =head2 extract_nutrition_facts_from_text ( LC, TEXT, NUTRIENTS_REF ) C extract nutrition facts from a text @@ -1886,7 +1812,7 @@ Reference to a scalar that will be set to the serving size if the nutrition fact =cut -sub extract_nutrition_facts_from_text ($text_lc, $text, $nutrients_ref, $nutrition_data_per_ref, $serving_size_ref) { +sub extract_nutrition_facts_from_text($text_lc, $text, $nutrients_ref, $nutrition_data_per_ref, $serving_size_ref) { if ((defined $text) and ($text ne "")) { @@ -1899,10 +1825,7 @@ sub extract_nutrition_facts_from_text ($text_lc, $text, $nutrients_ref, $nutriti } } elsif ($text_lc eq "fr") { - if ($text - =~ /^\s*(à la |a la |pour |par |)(1 |une )?portion (de |d'environ )?\(? ?(\d+((\.|,)\d+)? ?(g|kg|mg|µg|l|dl|cl|ml))/i - ) - { + if ($text =~ /^\s*(à la |a la |pour |par |)(1 |une )?portion (de |d'environ )?\(? ?(\d+((\.|,)\d+)? ?(g|kg|mg|µg|l|dl|cl|ml))/i) { ${$nutrition_data_per_ref} = "serving"; ${$serving_size_ref} = $4; } @@ -1943,9 +1866,7 @@ sub extract_nutrition_facts_from_text ($text_lc, $text, $nutrients_ref, $nutriti # Vitamine D µg 0.4 soit 8 % des AQR* - if ($text - =~ /\b$synonym\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)/i) - { + if ($text =~ /\b$synonym\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)/i) { $unit = $1; $value = $5; if ((defined $3) and ($3 ne "")) { @@ -1973,9 +1894,7 @@ sub extract_nutrition_facts_from_text ($text_lc, $text, $nutrients_ref, $nutriti $value = 0; last; } - elsif ($text - =~ /\b$synonym(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?/i) - { + elsif ($text =~ /\b$synonym(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?/i) { $value = $4; $unit = $7; if ((defined $2) and ($2 ne "")) { @@ -1990,9 +1909,7 @@ sub extract_nutrition_facts_from_text ($text_lc, $text, $nutrients_ref, $nutriti last; } # missing unit... assume g ? - elsif ($text - =~ /\b$synonym(\s|:)+(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)?\)?\b/i) - { + elsif ($text =~ /\b$synonym(\s|:)+(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)?\)?\b/i) { $value = $4; $unit = "g"; if ((defined $2) and ($2 ne "")) { @@ -2041,5 +1958,7 @@ sub extract_nutrition_facts_from_text ($text_lc, $text, $nutrients_ref, $nutriti return; } + + 1; From ee8534ed5dc13deae8bb235e7f446277bea34f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Fri, 3 Feb 2023 17:25:11 +0100 Subject: [PATCH 5/8] lint --- lib/ProductOpener/GS1.pm | 1086 ++++++++++++++++------------ lib/ProductOpener/ImportConvert.pm | 847 ++++++++++++---------- 2 files changed, 1098 insertions(+), 835 deletions(-) diff --git a/lib/ProductOpener/GS1.pm b/lib/ProductOpener/GS1.pm index d562ab54ef55e..ab4dee4aaf0bb 100644 --- a/lib/ProductOpener/GS1.pm +++ b/lib/ProductOpener/GS1.pm @@ -42,14 +42,12 @@ And the %gs1_maps translate the GS1 specific identifiers (e.g. for allergens or package ProductOpener::GS1; use ProductOpener::PerlStandards; -use Exporter qw< import >; +use Exporter qw< import >; use Log::Any qw($log); - -BEGIN -{ - use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); +BEGIN { + use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); @EXPORT_OK = qw( %gs1_maps @@ -65,7 +63,7 @@ BEGIN %EXPORT_TAGS = (all => [@EXPORT_OK]); } -use vars @EXPORT_OK ; +use vars @EXPORT_OK; use ProductOpener::Config qw/:all/; use ProductOpener::Tags qw/:all/; @@ -75,7 +73,6 @@ use JSON::PP; use boolean; use Data::DeepAccess qw(deep_get); - =head1 GS1 MAPS GS1 uses many different codes for allergens, packaging etc. @@ -93,7 +90,6 @@ Maps from GS1 to OFF =cut - my %unknown_entries_in_gs1_maps = (); # see https://www.gs1.fr/content/download/2265/17736/version/3/file/FicheProduit3.1.9_PROFIL_ParfumerieSelective_20190523.xlsx @@ -138,7 +134,7 @@ my %unknown_entries_in_gs1_maps = (); "UW" => "Wheat", "X99" => "None", }, - + measurementUnitCode => { "GRM" => "g", "KGM" => "kg", @@ -151,7 +147,7 @@ my %unknown_entries_in_gs1_maps = (); "KJO" => "kJ", "H87" => "pièces", }, - + # reference: GS1 T4073 Nutrient type code # https://gs1.se/en/guides/documentation/test-code-lists/t4073-nutrient-type-code/ nutrientTypeCode => { @@ -159,7 +155,7 @@ my %unknown_entries_in_gs1_maps = (); "CA" => "calcium", "CASN" => "casein", "CHOAVL" => "carbohydrates", - "CHOCAL" => "vitamin-d", # cholecalciferol + "CHOCAL" => "vitamin-d", # cholecalciferol "CHOLN" => "choline", "CLD" => "chloride", "CR" => "chromium", @@ -210,8 +206,8 @@ my %unknown_entries_in_gs1_maps = (); "NUCLEOTIDE" => "nucleotide", "P" => "phosphorus", "PANTAC" => "pantothenic-acid", - "POLYL" => "polyols", - "POLYLS" => "polyols", + "POLYL" => "polyols", + "POLYLS" => "polyols", "PRO-" => "proteins", "RIBF" => "vitamin-b2", "SALTEQ" => "salt", @@ -254,8 +250,8 @@ my %unknown_entries_in_gs1_maps = (); "PUG" => "en:carrying-bag", "TU" => "en:tube", "WRP" => "en:film", - }, - + }, + # http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:PackagingMarkedLabelAccreditationCode packagingMarkedLabelAccreditationCode => { "ADCCPA" => "fr:produit-certifie", @@ -283,7 +279,7 @@ my %unknown_entries_in_gs1_maps = (); "FAIR_TRADE_MARK" => "en:fairtrade-international", "FAIRTRADE_COCOA" => "en:fair-trade", "FAIR_TRADE_USA" => "en:fairtrade-usa", - "FOREST_STEWARDSHIP_COUNCIL_LABEL" => "en:fsc", + "FOREST_STEWARDSHIP_COUNCIL_LABEL" => "en:fsc", "FOREST_STEWARDSHIP_COUNCIL_MIX" => "en:fsc-mix", "FOREST_STEWARDSHIP_COUNCIL_RECYCLED" => "en:fsc-recycled", "GREEN_DOT" => "en:green-dot", @@ -311,7 +307,7 @@ my %unknown_entries_in_gs1_maps = (); "VOLAILLE_FRANCAISE" => "en:french-poultry", }, - # https://gs1.se/en/guides/documentation/code-lists/t3783-target-market-country-code/ + # https://gs1.se/en/guides/documentation/code-lists/t3783-target-market-country-code/ targetMarketCountryCode => { "040" => "en:austria", "056" => "en:belgium", @@ -321,11 +317,11 @@ my %unknown_entries_in_gs1_maps = (); "724" => "en:spain", "756" => "en:switzerland", }, - + timeMeasurementUnitCode => { "MON" => "month", "DAY" => "day", - }, + }, ); # Normalize some entries @@ -337,8 +333,9 @@ foreach my $tag (sort keys %{$gs1_maps{allergenTypeCode}}) { } else { $log->error("gs1_maps - entry not in taxonomy", - { tagtype => "allergens", tag => $gs1_maps{allergenTypeCode}{$tag} }) if $log->is_error(); - die; + {tagtype => "allergens", tag => $gs1_maps{allergenTypeCode}{$tag}}) + if $log->is_error(); + die; } } @@ -349,12 +346,12 @@ foreach my $tag (sort keys %{$gs1_maps{packagingMarkedLabelAccreditationCode}}) } else { $log->error("gs1_maps - entry not in taxonomy", - { tagtype => "labels", tag => $gs1_maps{packagingMarkedLabelAccreditationCode}{$tag} }) if $log->is_error(); - die; + {tagtype => "labels", tag => $gs1_maps{packagingMarkedLabelAccreditationCode}{$tag}}) + if $log->is_error(); + die; } } - =head2 %gs1_message_to_off Defines the structure of the GS1 message data and how to extract the fields useful to create a message confirmation. @@ -365,40 +362,62 @@ my %gs1_message_to_off = ( fields => [ - ["catalogue_item_notification:catalogueItemNotificationMessage", { + [ + "catalogue_item_notification:catalogueItemNotificationMessage", + { fields => [ - ["sh:StandardBusinessDocumentHeader", { + [ + "sh:StandardBusinessDocumentHeader", + { fields => [ ], } ], - ["transaction", { + [ + "transaction", + { fields => [ - ["transactionIdentification", { + [ + "transactionIdentification", + { fields => [ ["entityIdentification", "transactionIdentification_entityIdentification"], - ["contentOwner", { - fields => [ - ["gln", "transactionIdentification_contentOwner_gln"], - ], + [ + "contentOwner", + { + fields => [["gln", "transactionIdentification_contentOwner_gln"],], } ], ], }, ], - ["documentCommand", { + [ + "documentCommand", + { fields => [ - ["documentCommandHeader", { + [ + "documentCommandHeader", + { fields => [ - ["documentCommandIdentification", { + [ + "documentCommandIdentification", + { fields => [ - ["entityIdentification", "documentCommandIdentification_entityIdentification"], - ["contentOwner", { + [ + "entityIdentification", + "documentCommandIdentification_entityIdentification" + ], + [ + "contentOwner", + { fields => [ - ["gln", "documentCommandIdentification_contentOwner_gln"], + [ + "gln", + "documentCommandIdentification_contentOwner_gln" + ], ], } ], @@ -410,30 +429,56 @@ my %gs1_message_to_off = ( }, ], - ["catalogue_item_notification:catalogueItemNotification", { + [ + "catalogue_item_notification:catalogueItemNotification", + { fields => [ - ["creationDateTime", "catalogueItemNotification_creationDateTime"], - ["documentStatusCode", "catalogueItemNotification_documentStatusCode"], - ["catalogueItemNotificationIdentification", { + [ + "creationDateTime", + "catalogueItemNotification_creationDateTime" + ], + [ + "documentStatusCode", + "catalogueItemNotification_documentStatusCode" + ], + [ + "catalogueItemNotificationIdentification", + { fields => [ - ["entityIdentification", "catalogueItemNotificationIdentification_entityIdentification"], - ["contentOwner", { + [ + "entityIdentification", + "catalogueItemNotificationIdentification_entityIdentification" + ], + [ + "contentOwner", + { fields => [ - ["gln", "catalogueItemNotificationIdentification_contentOwner_gln"], + [ + "gln", + "catalogueItemNotificationIdentification_contentOwner_gln" + ], ], } ], ], }, ], - ["catalogueItem", { + [ + "catalogueItem", + { fields => [ - ["tradeItem", { + [ + "tradeItem", + { fields => [ ["gtin", "gtin"], - ["targetMarket", { + [ + "targetMarket", + { fields => [ - ["targetMarketCountryCode", "targetMarketCountryCode"], + ["targetMarketCountryCode", + "targetMarketCountryCode" + ], ], }, ], @@ -458,7 +503,6 @@ my %gs1_message_to_off = ( ], ); - =head2 %gs1_product_to_off Defines the structure of the GS1 product data and how it maps to the OFF data. @@ -467,17 +511,17 @@ Defines the structure of the GS1 product data and how it maps to the OFF data. my %gs1_product_to_off = ( - match => [ - ["isTradeItemAConsumerUnit", "true"], - ], + match => [["isTradeItemAConsumerUnit", "true"],], fields => [ - + # source_field => target_field : assign the value of the source field to the target field ["gtin", "code"], # source_field => source_hash : go down one level - ["brandOwner", { + [ + "brandOwner", + { fields => [ ["gln", "sources_fields:org-gs1:gln"], # source_field => target_field1,target_field2 : assign value of the source field to multiple target fields @@ -485,18 +529,22 @@ my %gs1_product_to_off = ( ], }, ], - - ["gdsnTradeItemClassification", { + + [ + "gdsnTradeItemClassification", + { fields => [ ["gpcCategoryCode", "sources_fields:org-gs1:gpcCategoryCode"], # not always present and could be in different languages ["gpcCategoryName", "sources_fields:org-gs1:gpcCategoryName, +categories_if_match_in_taxonomy"], ], }, - ], - + ], + # will override brandOwner values if present - ["informationProviderOfTradeItem", { + [ + "informationProviderOfTradeItem", + { fields => [ ["gln", "sources_fields:org-gs1:gln"], # source_field => target_field1,target_field2 : assign value of the source field to multiple target fields @@ -504,143 +552,167 @@ my %gs1_product_to_off = ( ], }, ], - - ["targetMarket", { - fields => [ - ["targetMarketCountryCode", "countries%targetMarketCountryCode"], - ], + + [ + "targetMarket", + { + fields => [["targetMarketCountryCode", "countries%targetMarketCountryCode"],], }, ], - + # http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:ContactTypeCode&release=4 # source_field => array of hashes: go down one level, expect an array - ["tradeItemContactInformation", [ + [ + "tradeItemContactInformation", + [ { # match => hash of key value conditions: assign values to field only if the conditions match - match => [ - ["contactTypeCode", "CXC"], - ], - fields => [ - ["contactName", "customer_service_fr"], - ["contactAddress", "+customer_service_fr"], - ], + match => [["contactTypeCode", "CXC"],], + fields => [["contactName", "customer_service_fr"], ["contactAddress", "+customer_service_fr"],], }, ], ], - - ["tradeItemInformation", + + [ + "tradeItemInformation", { fields => [ # Sometimes contains strings like "Signal CLAY&CHARCOAL DENTIFRICE 75 ML", not a good fit for the producer_version_id # but other time contains strings that look like internal version ids / item ids (e.g. "44041392") - ["productionVariantDescription", "sources_fields:org-gs1:productionVariantDescription, producer_version_id"], - - ["extension", { + [ + "productionVariantDescription", + "sources_fields:org-gs1:productionVariantDescription, producer_version_id" + ], + + [ + "extension", + { fields => [ - ["alcohol_information:alcoholInformationModule", { + [ + "alcohol_information:alcoholInformationModule", + { fields => [ - ["alcoholInformation", { - fields => [ - ["percentageOfAlcoholByVolume", "alcohol_100g_value"], - ], + [ + "alcoholInformation", + { + fields => [["percentageOfAlcoholByVolume", "alcohol_100g_value"],], }, ], ], }, - ], - - ["allergen_information:allergenInformationModule", { + ], + + [ + "allergen_information:allergenInformationModule", + { fields => [ - ["allergenRelatedInformation", { + [ + "allergenRelatedInformation", + { fields => [ - ["allergen", [ + [ + "allergen", + [ { - match => [ - ["levelOfContainmentCode", "CONTAINS"], - ], + match => [["levelOfContainmentCode", "CONTAINS"],], fields => [ # source_field => +target_field' : add to field, separate with commas if field is not empty # source_field => target_field%map_id : map the target value using the specified map_id # (do not assign a value if there is no corresponding entry in the map) - ['allergenTypeCode', '+allergens%allergenTypeCode'], + [ + 'allergenTypeCode', + '+allergens%allergenTypeCode' + ], ], }, { - match => [ - ["levelOfContainmentCode", "MAY_CONTAIN"], - ], + match => + [["levelOfContainmentCode", "MAY_CONTAIN"],], fields => [ # source_field => +target_field' : add to field, separate with commas if field is not empty # source_field => target_field%map_id : map the target value using the specified map_id # (do not assign a value if there is no corresponding entry in the map) - ['allergenTypeCode', '+traces%allergenTypeCode'], + [ + 'allergenTypeCode', + '+traces%allergenTypeCode' + ], ], }, ], ], - ["isAllergenRelevantDataProvided", "sources_fields:org-gs1:isAllergenRelevantDataProvided"], + [ + "isAllergenRelevantDataProvided", + "sources_fields:org-gs1:isAllergenRelevantDataProvided" + ], ], }, ], ], }, ], - - ["nutritional_information:nutritionalInformationModule", { + + [ + "nutritional_information:nutritionalInformationModule", + { fields => [ - ["nutrientHeader"], # nutrients are handled specially with specific code + ["nutrientHeader"], # nutrients are handled specially with specific code ], }, ], - - ["consumer_instructions:consumerInstructionsModule", { + + [ + "consumer_instructions:consumerInstructionsModule", + { fields => [ - ["consumerInstructions", { - fields => [ - ["consumerStorageInstructions", "conservation_conditions"], - ], + [ + "consumerInstructions", + { + fields => + [["consumerStorageInstructions", "conservation_conditions"],], }, ], ], } ], - - ["food_and_beverage_ingredient:foodAndBeverageIngredientModule", { - fields => [ - ["ingredientStatement", "ingredients_text"], - ], + + [ + "food_and_beverage_ingredient:foodAndBeverageIngredientModule", + { + fields => [["ingredientStatement", "ingredients_text"],], }, ], - - ["nonfood_ingredient:nonfoodIngredientModule", { - fields => [ - ["nonfoodIngredientStatement", "ingredients_text"], - ], + + [ + "nonfood_ingredient:nonfoodIngredientModule", + { + fields => [["nonfoodIngredientStatement", "ingredients_text"],], }, - ], - - ["food_and_beverage_preparation_serving:foodAndBeveragePreparationServingModule", { + ], + + [ + "food_and_beverage_preparation_serving:foodAndBeveragePreparationServingModule", + { fields => [ - ["preparationServing", { - fields => [ - ["preparationInstructions", "preparation"], - ], + [ + "preparationServing", + { + fields => [["preparationInstructions", "preparation"],], }, ], ], }, ], - - ["health_related_information:healthRelatedInformationModule", { + + [ + "health_related_information:healthRelatedInformationModule", + { fields => [ - ["healthRelatedInformation", { - match => [ - ["nutritionalProgramCode","8"], - ], - fields => [ - ["nutritionalScore", "nutriscore_grade_producer"], - ], + [ + "healthRelatedInformation", + { + match => [["nutritionalProgramCode", "8"],], + fields => [["nutritionalScore", "nutriscore_grade_producer"],], }, ], ], @@ -648,14 +720,18 @@ my %gs1_product_to_off = ( ], # 2021-12-20: it looks like the nutritionalProgramCode is now in an extra nutritionProgram field - ["health_related_information:healthRelatedInformationModule", { + [ + "health_related_information:healthRelatedInformationModule", + { fields => [ - ["healthRelatedInformation", { + [ + "healthRelatedInformation", + { fields => [ - ["nutritionalProgram", { - match => [ - ["nutritionalProgramCode","8"], - ], + [ + "nutritionalProgram", + { + match => [["nutritionalProgramCode", "8"],], fields => [ ["nutritionalScore", "nutriscore_grade_producer"], ], @@ -667,25 +743,34 @@ my %gs1_product_to_off = ( ], }, ], - - ["packaging_information:packagingInformationModule", { + + [ + "packaging_information:packagingInformationModule", + { fields => [ - ["packaging", { - fields => [ - ["packagingTypeCode", "+packaging%packagingTypeCode"], - ], + [ + "packaging", + { + fields => [["packagingTypeCode", "+packaging%packagingTypeCode"],], }, ], ], }, - ], - - ["packaging_marking:packagingMarkingModule", { + ], + + [ + "packaging_marking:packagingMarkingModule", + { fields => [ - ["packagingMarking", { + [ + "packagingMarking", + { fields => [ # the source can be an array if there are multiple labels - ["packagingMarkedLabelAccreditationCode", "+labels%packagingMarkedLabelAccreditationCode"], + [ + "packagingMarkedLabelAccreditationCode", + "+labels%packagingMarkedLabelAccreditationCode" + ], ], }, ], @@ -693,9 +778,13 @@ my %gs1_product_to_off = ( }, ], - ["place_of_item_activity:placeOfItemActivityModule", { + [ + "place_of_item_activity:placeOfItemActivityModule", + { fields => [ - ["placeOfProductActivity", { + [ + "placeOfProductActivity", + { fields => [ # provenanceStatement is a free text field, which can contain manufacturing places # and/or origins of ingredients and related statements, in different languages @@ -705,42 +794,44 @@ my %gs1_product_to_off = ( ], ], }, - ], - - ["referenced_file_detail_information:referencedFileDetailInformationModule", { + ], + + [ + "referenced_file_detail_information:referencedFileDetailInformationModule", + { fields => [ - ["referencedFileHeader", [ + [ + "referencedFileHeader", + [ { - match => [ - ["isPrimaryFile", "TRUE"], - ], - fields => [ - ["uniformResourceIdentifier", "image_front_url"], - ], + match => [["isPrimaryFile", "TRUE"],], + fields => [["uniformResourceIdentifier", "image_front_url"],], }, { - does_not_match => [ - ["isPrimaryFile", "TRUE"], - ], - fields => [ - ["uniformResourceIdentifier", "+image_other_url"], - ], + does_not_match => [["isPrimaryFile", "TRUE"],], + fields => [["uniformResourceIdentifier", "+image_other_url"],], }, ], ], ], }, - ], - - ["trade_item_description:tradeItemDescriptionModule", { + ], + + [ + "trade_item_description:tradeItemDescriptionModule", + { fields => [ - ["tradeItemDescriptionInformation", { + [ + "tradeItemDescriptionInformation", + { fields => [ ["descriptionShort", "abbreviated_product_name"], ["functionalName", "+categories_if_match_in_taxonomy"], ["regulatedProductName", "generic_name"], ["tradeItemDescription", "product_name"], - ["brandNameInformation", { + [ + "brandNameInformation", + { fields => [ ['brandName' => '+brands'], ['subBrand' => '+brands'], @@ -753,16 +844,20 @@ my %gs1_product_to_off = ( ], }, ], - - ["trade_item_measurements:tradeItemMeasurementsModule", { + + [ + "trade_item_measurements:tradeItemMeasurementsModule", + { fields => [ - ["tradeItemMeasurements", { + [ + "tradeItemMeasurements", + { fields => [ ["netContent", "quantity"], - ["tradeItemWeight", { - fields => [ - ["netWeight", "net_weight"], - ], + [ + "tradeItemWeight", + { + fields => [["netWeight", "net_weight"],], }, ], ], @@ -770,11 +865,15 @@ my %gs1_product_to_off = ( ], ], }, - ], - - ["trade_item_lifespan:tradeItemLifespanModule", { + ], + + [ + "trade_item_lifespan:tradeItemLifespanModule", + { fields => [ - ["tradeItemLifespan", { + [ + "tradeItemLifespan", + { fields => [ # the source can be an array if there are multiple labels ["itemPeriodSafeToUseAfterOpening", "+periods_after_opening"], @@ -790,10 +889,13 @@ my %gs1_product_to_off = ( ], }, ], - - ["tradeItemSynchronisationDates", { + + [ + "tradeItemSynchronisationDates", + { fields => [ - ["publicationDateTime", "sources_fields:org-gs1:publicationDateTime"], # Not available in CodeOnline export + ["publicationDateTime", "sources_fields:org-gs1:publicationDateTime"] + , # Not available in CodeOnline export ["lastChangeDateTime", "sources_fields:org-gs1:lastChangeDateTime"], ], }, @@ -801,7 +903,6 @@ my %gs1_product_to_off = ( ], ); - =head1 FUNCTIONS =head2 init_csv_fields () @@ -816,11 +917,10 @@ my @csv_fields = (); sub init_csv_fields() { %seen_csv_fields = (); - @csv_fields = (); + @csv_fields = (); return; } - =head2 assign_field ( $results_ref $target_field $target_value) Used to assign a value to a field, and keep track of the order of the fields we are matching, @@ -828,10 +928,10 @@ so that we can output the fields in the same order when we export a CSV. =cut -sub assign_field($results_ref, $target_field, $target_value) { - +sub assign_field ($results_ref, $target_field, $target_value) { + $results_ref->{$target_field} = $target_value; - + if (not defined $seen_csv_fields{$target_field}) { push @csv_fields, $target_field; $seen_csv_fields{$target_field} = 1; @@ -839,8 +939,7 @@ sub assign_field($results_ref, $target_field, $target_value) { return; } - -sub extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, $nutrient_detail_ref ) { +sub extract_nutrient_quantity_contained ($type, $per, $results_ref, $nid, $nutrient_detail_ref) { my $nutrient_field = $nid . $type . "_" . $per; @@ -852,7 +951,9 @@ sub extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, $nutrie # - Equadis has 2 ENER- nutrientDetail, each with a single quantityContained hash # - Agena3000 has 1 ENER- nutrientDetail with an array of 2 quantityContained # --> convert a single hash to an array with a hash - if ((defined $nutrient_detail_ref->{quantityContained}) and (ref($nutrient_detail_ref->{quantityContained}) ne "ARRAY")) { + if ( (defined $nutrient_detail_ref->{quantityContained}) + and (ref($nutrient_detail_ref->{quantityContained}) ne "ARRAY")) + { $nutrient_detail_ref->{quantityContained} = [$nutrient_detail_ref->{quantityContained}]; } @@ -867,13 +968,14 @@ sub extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, $nutrie $nutrient_unit = $gs1_maps{measurementUnitCode}{$quantity_contained_ref->{measurementUnitCode}}; } else { - $log->error("gs1_to_off - unrecognized quantity contained", - { quantityContained => $quantity_contained_ref }) if $log->is_error(); + $log->error("gs1_to_off - unrecognized quantity contained", {quantityContained => $quantity_contained_ref}) + if $log->is_error(); } # less than < modifier - if ((defined $nutrient_detail_ref->{measurementPrecisionCode}) - and ($nutrient_detail_ref->{measurementPrecisionCode} eq "LESS_THAN")) { + if ( (defined $nutrient_detail_ref->{measurementPrecisionCode}) + and ($nutrient_detail_ref->{measurementPrecisionCode} eq "LESS_THAN")) + { $nutrient_value = "< " . $nutrient_value; } @@ -893,7 +995,6 @@ sub extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, $nutrie return; } - =head2 gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) Recursive function to go through all first level keys of the $gs1_to_off_ref mapping. @@ -912,100 +1013,113 @@ The same hash reference is passed to recursive calls to the gs1_to_off function. =cut - sub gs1_to_off; sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # We should have a hash if (ref($json_ref) ne "HASH") { - $log->error("gs1_to_off - json_ref is not a hash", { gs1_to_off_ref => $gs1_to_off_ref, json_ref => $json_ref, results_ref => $results_ref }) if $log->is_error(); + $log->error("gs1_to_off - json_ref is not a hash", + {gs1_to_off_ref => $gs1_to_off_ref, json_ref => $json_ref, results_ref => $results_ref}) + if $log->is_error(); return; } - - $log->debug("gs1_to_off", { json_ref_keys => [sort keys %$json_ref] }) if $log->is_debug(); - + + $log->debug("gs1_to_off", {json_ref_keys => [sort keys %$json_ref]}) if $log->is_debug(); + # Check the matching conditions if any - + if (defined $gs1_to_off_ref->{match}) { - - $log->debug("gs1_to_off - checking conditions", { match => $gs1_to_off_ref->{match} } ) if $log->is_debug(); - + + $log->debug("gs1_to_off - checking conditions", {match => $gs1_to_off_ref->{match}}) if $log->is_debug(); + foreach my $match_field_ref (@{$gs1_to_off_ref->{match}}) { - + my $match_field = $match_field_ref->[0]; my $match_value = $match_field_ref->[1]; - - if ((not defined $json_ref->{$match_field}) - or ($json_ref->{$match_field} ne $match_value)) { - - $log->debug("gs1_to_off - condition does not match", - { match_field => $match_field, + + if ( (not defined $json_ref->{$match_field}) + or ($json_ref->{$match_field} ne $match_value)) + { + + $log->debug( + "gs1_to_off - condition does not match", + { + match_field => $match_field, match_value => $match_value, - actual_value => $json_ref->{$match_field} }) if $log->is_debug(); - + actual_value => $json_ref->{$match_field} + } + ) if $log->is_debug(); + return; - } + } } - + $log->debug("gs1_to_off - conditions match") if $log->is_debug(); } - + # Check the matching exceptions - + if (defined $gs1_to_off_ref->{does_not_match}) { - - $log->debug("gs1_to_off - checking conditions", { does_not_match => $gs1_to_off_ref->{does_not_match} } ) if $log->is_debug(); - + + $log->debug("gs1_to_off - checking conditions", {does_not_match => $gs1_to_off_ref->{does_not_match}}) + if $log->is_debug(); + my $match = 1; - + foreach my $match_field_ref (@{$gs1_to_off_ref->{does_not_match}}) { - + my $match_field = $match_field_ref->[0]; my $match_value = $match_field_ref->[1]; - - if ((not defined $json_ref->{$match_field}) - or ($json_ref->{$match_field} ne $match_value)) { - - $log->debug("gs1_to_off - condition does not match", - { match_field => $match_field, + + if ( (not defined $json_ref->{$match_field}) + or ($json_ref->{$match_field} ne $match_value)) + { + + $log->debug( + "gs1_to_off - condition does not match", + { + match_field => $match_field, match_value => $match_value, - actual_value => $json_ref->{$match_field} }) if $log->is_debug(); - + actual_value => $json_ref->{$match_field} + } + ) if $log->is_debug(); + $match = 0; last; - } + } } - + return if $match; - + $log->debug("gs1_to_off - conditions match") if $log->is_debug(); - } - + } + $log->debug("gs1_to_off - assigning fields") if $log->is_debug(); - + # If the conditions match, assign the fields foreach my $source_field_ref (@{$gs1_to_off_ref->{fields}}) { - + my $source_field = $source_field_ref->[0]; my $source_target = $source_field_ref->[1]; - - $log->debug("gs1_to_off - source fields", { source_field => $source_field }) if $log->is_debug(); - + + $log->debug("gs1_to_off - source fields", {source_field => $source_field}) if $log->is_debug(); + if (defined $json_ref->{$source_field}) { - + $log->debug("gs1_to_off - existing source fields", - { source_field => $source_field, ref => ref($source_target) }) if $log->is_debug(); - + {source_field => $source_field, ref => ref($source_target)}) + if $log->is_debug(); + # if the source field is nutrientHeader, we need to extract multiple # nutrition facts tables (for unprepared and prepared product) # with multiple nutrients. # As the mapping is complex, it is done with special code below # instead of the generic matching code. - + if ($source_field eq "nutrientHeader") { - + $log->debug("gs1_to_off - special handling for nutrientHeader array") if $log->is_debug(); # If there is only one nutrition facts table, nutrientHeader might not be an array @@ -1014,36 +1128,39 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { if (ref($json_ref->{$source_field}) eq 'HASH') { $json_ref->{$source_field} = [$json_ref->{$source_field}]; } - + # Some products like ice cream may have nutrients per 100g + nutrients per 100ml # in that case, the last values (e.g. for 100g) will override previous values (e.g. for 100ml) - + foreach my $nutrient_header_ref (@{$json_ref->{$source_field}}) { - + my $type = ""; - + if ($nutrient_header_ref->{preparationStateCode} eq "PREPARED") { $type = "_prepared"; } - + my $serving_size_value; my $serving_size_unit; my $serving_size_description; my $serving_size_description_lc; - + if (defined $nutrient_header_ref->{servingSize}{'#'}) { $serving_size_value = $nutrient_header_ref->{servingSize}{'#'}; - $serving_size_unit = $gs1_maps{measurementUnitCode}{$nutrient_header_ref->{servingSize}{'@'}{measurementUnitCode}}; + $serving_size_unit = $gs1_maps{measurementUnitCode} + {$nutrient_header_ref->{servingSize}{'@'}{measurementUnitCode}}; } elsif (defined $nutrient_header_ref->{servingSize}{'$t'}) { $serving_size_value = $nutrient_header_ref->{servingSize}{'$t'}; - $serving_size_unit = $gs1_maps{measurementUnitCode}{$nutrient_header_ref->{servingSize}{measurementUnitCode}}; + $serving_size_unit + = $gs1_maps{measurementUnitCode}{$nutrient_header_ref->{servingSize}{measurementUnitCode}}; } else { $log->error("gs1_to_off - unrecognized serving size", - { servingSize => $nutrient_header_ref->{servingSize} }) if $log->is_error(); + {servingSize => $nutrient_header_ref->{servingSize}}) + if $log->is_error(); } - + # We may have a servingSizeDescription in multiple languages, in that case, take the first one if (defined $nutrient_header_ref->{servingSizeDescription}) { @@ -1068,19 +1185,20 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { } } } - + my $per = "100g"; - + if ((defined $serving_size_value) and ($serving_size_value != 100)) { $per = "serving"; - $serving_size_value += 0; # remove extra .0 - + $serving_size_value += 0; # remove extra .0 + # Some serving sizes have an extra description # e.g. par portion : 14 g + 200 ml d'eau my $extra_serving_size_description = ""; if ((defined $serving_size_description) and (defined $serving_size_description_lc)) { # Par Portion de 30 g (2) - $serving_size_description =~ s/^(par |pour )?((1 |une )?(part |portion ))?(de )?\s*:?=?\s*//i; + $serving_size_description + =~ s/^(par |pour )?((1 |une )?(part |portion ))?(de )?\s*:?=?\s*//i; $serving_size_description =~ s/( |\d)(gr|grammes)$/$1g/i; # Par Portion de 30 g (2) : remove number of portions $serving_size_description =~ s/\(\d+\)//i; @@ -1089,93 +1207,110 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # skip the extra description if it is equal to value + unit # to avoid things like 43 g (43 g) # "Pour 45g ?²?" --> ignore bogus characters at the end - if (($serving_size_description !~ /^\s*$/) - and ($serving_size_description !~ /^$serving_size_value\s*$serving_size_unit(\?|\.|\,|\s|\*|²)*$/i)) { + if ( + ($serving_size_description !~ /^\s*$/) + and ($serving_size_description + !~ /^$serving_size_value\s*$serving_size_unit(\?|\.|\,|\s|\*|²)*$/i) + ) + { $extra_serving_size_description = ' (' . $serving_size_description . ')'; } } - - assign_field($results_ref, "serving_size", $serving_size_value . " " . $serving_size_unit . $extra_serving_size_description); + + assign_field($results_ref, "serving_size", + $serving_size_value . " " . $serving_size_unit . $extra_serving_size_description); } - + if (defined $nutrient_header_ref->{nutrientDetail}) { - + # If there's only one nutrient, we may not get an array - + if (ref($nutrient_header_ref->{nutrientDetail}) ne 'ARRAY') { - $log->error("gs1_to_off - nutrient_header is not an array ", { results_ref => $results_ref }) if $log->is_error(); + $log->error("gs1_to_off - nutrient_header is not an array ", {results_ref => $results_ref}) + if $log->is_error(); next; } - + foreach my $nutrient_detail_ref (@{$nutrient_header_ref->{nutrientDetail}}) { my $nid = $gs1_maps{nutrientTypeCode}{$nutrient_detail_ref->{nutrientTypeCode}}; - + if (defined $nid) { - extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, $nutrient_detail_ref) + extract_nutrient_quantity_contained($type, $per, $results_ref, $nid, + $nutrient_detail_ref); } else { $log->error("gs1_to_off - unrecognized nutrient", - { code => $results_ref->{code}, nutrient_detail_ref => $nutrient_detail_ref }) if $log->is_error(); + {code => $results_ref->{code}, nutrient_detail_ref => $nutrient_detail_ref}) + if $log->is_error(); my $map = "nutrientTypeCode"; my $source_value = $nutrient_detail_ref->{nutrientTypeCode}; defined $unknown_entries_in_gs1_maps{$map} or $unknown_entries_in_gs1_maps{$map} = {}; - defined $unknown_entries_in_gs1_maps{$map}{$source_value} or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; + defined $unknown_entries_in_gs1_maps{$map}{$source_value} + or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; $unknown_entries_in_gs1_maps{$map}{$source_value}++; } } } } } - - # If the value is a scalar, it is a target field (or multiple target fields) + + # If the value is a scalar, it is a target field (or multiple target fields) elsif (ref($source_target) eq "") { - - $log->debug("gs1_to_off - source field directly maps to target field", - { source_field => $source_field, target_field => $source_target }) if $log->is_debug(); - + + $log->debug( + "gs1_to_off - source field directly maps to target field", + {source_field => $source_field, target_field => $source_target} + ) if $log->is_debug(); + # We may have multiple source values, in an array - + my @source_values; - + if (ref($json_ref->{$source_field}) eq "ARRAY") { @source_values = @{$json_ref->{$source_field}}; } else { @source_values = ($json_ref->{$source_field}); } - + foreach my $source_value (@source_values) { - + # We may have multiple target fields, separated by commas foreach my $target_field (split(/\s*,\s*/, $source_target)) { - - $log->debug("gs1_to_off - assign value to target field", - { source_field => $source_field, source_value => $source_value, target_field => $target_field }) if $log->is_debug(); - + + $log->debug( + "gs1_to_off - assign value to target field", + { + source_field => $source_field, + source_value => $source_value, + target_field => $target_field + } + ) if $log->is_debug(); + # We might combine a value and a unit, but also keep them separate so that we can assign fields like quantity_value and quantity_unit my $source_value_value; my $source_value_unit; - + # Some fields indicate a language: - - # ingredientStatement: { - # languageCode: "fr", - # $t: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium." - # }, + + # ingredientStatement: { + # languageCode: "fr", + # $t: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium." + # }, # or another format (depending on how the XML was converted to JSON): - - # ingredientStatement: { - # #: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium.", - # @: { - # languageCode: "fr" - # } - # }, + + # ingredientStatement: { + # #: "Ingrédients: LAIT entier en poudre (38,9%), PETIT-LAIT filtré en poudre, café soluble (8,0%), fibres de chicorée (oligofructose) (8%), chicorée soluble (7,5%), stabilisant : E331, correcteur d'acidité : E340, sulfate de magnésium.", + # @: { + # languageCode: "fr" + # } + # }, if (ref($source_value) eq "HASH") { my $language_code; my $value; - + # There may be a language code if (defined $source_value->{languageCode}) { $language_code = $source_value->{languageCode}; @@ -1183,26 +1318,27 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { elsif ((defined $source_value->{'@'}) and (defined $source_value->{'@'}{languageCode})) { $language_code = $source_value->{'@'}{languageCode}; } - + # Keep track of language codes so that we can assign the lc and lang fields if (defined $language_code) { defined $results_ref->{languages} or $results_ref->{languages} = {}; - defined $results_ref->{languages}{$language_code} or $results_ref->{languages}{$language_code} = 0; + defined $results_ref->{languages}{$language_code} + or $results_ref->{languages}{$language_code} = 0; $results_ref->{languages}{$language_code}++; } - + if (defined $source_value->{'$t'}) { $value = $source_value->{'$t'}; } elsif (defined $source_value->{'#'}) { $value = $source_value->{'#'}; } - + # There may be a measurement unit code, or a time measurement unit code # in that case, concatenate it to the value - + foreach my $code ("measurementUnitCode", "timeMeasurementUnitCode") { - + if (defined $source_value->{$code}) { $source_value_value = $value; $source_value_unit = $gs1_maps{$code}{$source_value->{$code}}; @@ -1214,30 +1350,45 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { $value .= " " . $gs1_maps{$code}{$source_value->{'@'}{$code}}; } } - + # If the field is a language specific field, we can assign the value to the language specific field if ((defined $language_code) and (defined $language_fields{$target_field})) { $target_field = $target_field . "_" . lc($language_code); - $log->debug("gs1_to_off - changed to language specific target field", - { source_field => $source_field, source_value => $source_value, target_field => $target_field }) if $log->is_debug(); + $log->debug( + "gs1_to_off - changed to language specific target field", + { + source_field => $source_field, + source_value => $source_value, + target_field => $target_field + } + ) if $log->is_debug(); } - + if (defined $value) { $source_value = $value; } else { - $log->error("gs1_to_off - issue with source value structure", - { source_field => $source_field, source_value => $source_value, target_field => $target_field }) if $log->is_error(); + $log->error( + "gs1_to_off - issue with source value structure", + { + source_field => $source_field, + source_value => $source_value, + target_field => $target_field + } + ) if $log->is_error(); $source_value = undef; } } - - if ((defined $source_value) and ($source_value ne "") + + if ( + (defined $source_value) + and ($source_value ne "") # CodeOnline sometimes has empty values '.' or '0' for partyName for one of the fields brandOwner or informationProviderOfTradeItem # ignore them in order to keep the partyName value from the other fields - and not (($source_field eq "partyName") and (length($source_value) < 2)) - ) { - + and not(($source_field eq "partyName") and (length($source_value) < 2)) + ) + { + # allergenTypeCode => '+traces%allergens', # % sign means we will use a map to transform the source value if ($target_field =~ /\%/) { @@ -1247,28 +1398,38 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { $source_value = $gs1_maps{$map}{$source_value}; } else { - $log->error("gs1_to_off - unknown source value for map", - { code => $results_ref->{code}, source_field => $source_field, source_value => $source_value, target_field => $target_field, map => $map }) if $log->is_error(); - defined $unknown_entries_in_gs1_maps{$map} or $unknown_entries_in_gs1_maps{$map} = {}; - defined $unknown_entries_in_gs1_maps{$map}{$source_value} or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; + $log->error( + "gs1_to_off - unknown source value for map", + { + code => $results_ref->{code}, + source_field => $source_field, + source_value => $source_value, + target_field => $target_field, + map => $map + } + ) if $log->is_error(); + defined $unknown_entries_in_gs1_maps{$map} + or $unknown_entries_in_gs1_maps{$map} = {}; + defined $unknown_entries_in_gs1_maps{$map}{$source_value} + or $unknown_entries_in_gs1_maps{$map}{$source_value} = 0; $unknown_entries_in_gs1_maps{$map}{$source_value}++; # Skip the entry next; } - } - + } + # allergenTypeCode => '+traces%allergens', # + sign means we will create a comma separated list if we have multiple values if ($target_field =~ /^\+/) { $target_field = $'; - + if (defined $results_ref->{$target_field}) { $source_value = $results_ref->{$target_field} . ', ' . $source_value; } } - + assign_field($results_ref, $target_field, $source_value); - + if ($target_field eq "quantity") { if (defined $source_value_value) { assign_field($results_ref, $target_field . "_value", $source_value_value); @@ -1277,53 +1438,53 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { } } } - + } } - + elsif (ref($source_target) eq "ARRAY") { - -# http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:ContactTypeCode&release=4 -# source_field => array of hashes: go down one level, expect an array -# -# ["tradeItemContactInformation", [ -# { -# # match => hash of key value conditions: assign values to field only if the conditions match -# match => [ -# ["contactTypeCode", "CXC"], -# ], -# fields => [ -# ["contactAddress", "customer_service_fr"], -# ], -# }, -# ], -# ], - - $log->debug("gs1_to_off - array field", { source_field => $source_field }) if $log->is_debug(); - + + # http://apps.gs1.org/GDD/Pages/clDetails.aspx?semanticURN=urn:gs1:gdd:cl:ContactTypeCode&release=4 + # source_field => array of hashes: go down one level, expect an array + # + # ["tradeItemContactInformation", [ + # { + # # match => hash of key value conditions: assign values to field only if the conditions match + # match => [ + # ["contactTypeCode", "CXC"], + # ], + # fields => [ + # ["contactAddress", "customer_service_fr"], + # ], + # }, + # ], + # ], + + $log->debug("gs1_to_off - array field", {source_field => $source_field}) if $log->is_debug(); + # Loop through the array entries of the GS1 to OFF mapping - + foreach my $gs1_to_off_array_entry_ref (@{$source_target}) { - + # Loop through the array entries of the JSON file - + # If the source file is not an array, create it # (e.g. if only one element is there, the xml to json conversion might not create an array) if (ref($json_ref->{$source_field}) ne "ARRAY") { - $json_ref->{$source_field} = [ $json_ref->{$source_field} ]; + $json_ref->{$source_field} = [$json_ref->{$source_field}]; } - + foreach my $json_array_entry_ref (@{$json_ref->{$source_field}}) { gs1_to_off($gs1_to_off_array_entry_ref, $json_array_entry_ref, $results_ref); } } - } - + } + elsif (ref($source_target) eq "HASH") { - + # Go down one level - + # The source structure may be a hash or an array of hashes # e.g. Equadis: allergenRelatedInformation is a hash, CodeOnline: it is an array @@ -1339,13 +1500,14 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # { # allergenTypeCode: "AE", # levelOfContainmentCode: "CONTAINS" - # }, + # }, $log->debug("gs1_to_off - source_target is a hash", - { source_field => $source_field, source_target => $source_target, json_ref => $json_ref }) if $log->is_debug(); - + {source_field => $source_field, source_target => $source_target, json_ref => $json_ref}) + if $log->is_debug(); + if (ref($json_ref->{$source_field}) eq "HASH") { - + gs1_to_off($source_target, $json_ref->{$source_field}, $results_ref); } elsif (ref($json_ref->{$source_field}) eq "ARRAY") { @@ -1356,13 +1518,20 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { # allergenRelatedInformation: [ # [ ] # ] - + if (ref($json_array_entry_ref) eq "HASH") { gs1_to_off($source_target, $json_array_entry_ref, $results_ref); } else { - $log->debug("gs1_to_off - expected a hash but got an array", - { source_field => $source_field, source_target => $source_target, json_ref => $json_ref, json_array_entry_ref => $json_array_entry_ref }) if $log->is_debug(); + $log->debug( + "gs1_to_off - expected a hash but got an array", + { + source_field => $source_field, + source_target => $source_target, + json_ref => $json_ref, + json_array_entry_ref => $json_array_entry_ref + } + ) if $log->is_debug(); } } } @@ -1372,7 +1541,6 @@ sub gs1_to_off ($gs1_to_off_ref, $json_ref, $results_ref) { return; } - =head2 convert_single_text_property_to_direct_value ($json ) There are different ways to convert a XML document to a JSON data structure. @@ -1406,16 +1574,18 @@ gtin: "03449865355608" =cut +sub convert_single_text_property_to_direct_value ($json_ref) { -sub convert_single_text_property_to_direct_value($json_ref) { - - my $type = ref $json_ref or return; + my $type = ref $json_ref or return; - if ($type eq 'HASH') { + if ($type eq 'HASH') { foreach my $key (keys %$json_ref) { if (ref $json_ref->{$key}) { # Hash with a single $t value? - if ((ref $json_ref->{$key} eq 'HASH') and ((scalar keys %{$json_ref->{$key}}) == 1) and (defined $json_ref->{$key}{'$t'})) { + if ( (ref $json_ref->{$key} eq 'HASH') + and ((scalar keys %{$json_ref->{$key}}) == 1) + and (defined $json_ref->{$key}{'$t'})) + { $json_ref->{$key} = $json_ref->{$key}{'$t'}; } else { @@ -1423,18 +1593,17 @@ sub convert_single_text_property_to_direct_value($json_ref) { } } } - } - elsif ($type eq 'ARRAY') { - - foreach my $elem (@$json_ref) { - if (ref $elem) { - convert_single_text_property_to_direct_value($elem); - } - } - } - return; -} + } + elsif ($type eq 'ARRAY') { + foreach my $elem (@$json_ref) { + if (ref $elem) { + convert_single_text_property_to_direct_value($elem); + } + } + } + return; +} =head2 convert_gs1_json_message_to_off_products_csv_fields ($json, $products_ref, $messages_ref) @@ -1463,9 +1632,8 @@ Each message will be added as one element (a hash ref) of the messages data arra =cut +sub convert_gs1_json_message_to_off_products_csv ($json_ref, $products_ref, $messages_ref) { -sub convert_gs1_json_message_to_off_products_csv($json_ref, $products_ref, $messages_ref) { - # Depending on how the original XML was converted to JSON, # text values of XML tags can be assigned directly as the value of the corresponding key # or they can be stored inside a hash with the $t key @@ -1473,10 +1641,10 @@ sub convert_gs1_json_message_to_off_products_csv($json_ref, $products_ref, $mess # levelOfContainmentCode: { # $t: "MAY_CONTAIN" # }, - + # The JSON can contain only the product information "tradeItem" level # or the tradeItem can be encapsulated in a message - + # catalogue_item_notification:catalogueItemNotificationMessage # - transaction # -- documentCommand @@ -1490,62 +1658,69 @@ sub convert_gs1_json_message_to_off_products_csv($json_ref, $products_ref, $mess my $message_ref = {}; gs1_to_off(\%gs1_message_to_off, $json_ref, $message_ref); push @$messages_ref, $message_ref; - $log->debug("convert_gs1_json_to_off_csv - GS1 message fields", { message_ref => $message_ref }) if $log->is_debug(); + $log->debug("convert_gs1_json_to_off_csv - GS1 message fields", {message_ref => $message_ref}) + if $log->is_debug(); } - - foreach my $field (qw( + + foreach my $field ( + qw( catalogue_item_notification:catalogueItemNotificationMessage transaction documentCommand catalogue_item_notification:catalogueItemNotification catalogueItem - )) { + ) + ) + { if (defined $json_ref->{$field}) { $json_ref = $json_ref->{$field}; - $log->debug("convert_gs1_json_to_off_csv - remove encapsulating field", { field => $field }) if $log->is_debug(); + $log->debug("convert_gs1_json_to_off_csv - remove encapsulating field", {field => $field}) + if $log->is_debug(); } } # A product can contain a child product my $child_product_json_ref = deep_get($json_ref, qw(catalogueItemChildItemLink catalogueItem)); if (defined $child_product_json_ref) { - $log->debug("convert_gs1_json_to_off_csv - found a child item", { }) if $log->is_debug(); - convert_gs1_json_message_to_off_products_csv($child_product_json_ref, $products_ref, $messages_ref) + $log->debug("convert_gs1_json_to_off_csv - found a child item", {}) if $log->is_debug(); + convert_gs1_json_message_to_off_products_csv($child_product_json_ref, $products_ref, $messages_ref); } if (defined $json_ref->{tradeItem}) { $json_ref = $json_ref->{tradeItem}; } - + if (not defined $json_ref->{gtin}) { - $log->debug("convert_gs1_json_to_off_csv - no gtin - skipping", { json_ref => $json_ref }) if $log->is_debug(); + $log->debug("convert_gs1_json_to_off_csv - no gtin - skipping", {json_ref => $json_ref}) if $log->is_debug(); return {}; } - + if ((not defined $json_ref->{isTradeItemAConsumerUnit}) or ($json_ref->{isTradeItemAConsumerUnit} ne "true")) { - $log->debug("convert_gs1_json_to_off_csv - isTradeItemAConsumerUnit not true - skipping", - { isTradeItemAConsumerUnit => $json_ref->{isTradeItemAConsumerUnit} }) if $log->is_debug(); + $log->debug( + "convert_gs1_json_to_off_csv - isTradeItemAConsumerUnit not true - skipping", + {isTradeItemAConsumerUnit => $json_ref->{isTradeItemAConsumerUnit}} + ) if $log->is_debug(); return {}; } - + my $product_ref = {}; - + gs1_to_off(\%gs1_product_to_off, $json_ref, $product_ref); - + # assign the lang and lc fields if (defined $product_ref->{languages}) { - my @sorted_languages = sort ( { $product_ref->{languages}{$b} <=> $product_ref->{languages}{$a} } keys %{$product_ref->{languages}}); + my @sorted_languages = sort ({$product_ref->{languages}{$b} <=> $product_ref->{languages}{$a}} + keys %{$product_ref->{languages}}); my $top_language = $sorted_languages[0]; $product_ref->{lc} = $top_language; $product_ref->{lang} = $top_language; delete $product_ref->{languages}; } - + push @$products_ref, $product_ref; return; } - =head2 read_gs1_json_file ($json_file, $products_ref, $messages_ref) Read a GS1 message file in json format, convert the included products in the OFF format, @@ -1564,12 +1739,12 @@ The encapsulating GS1 message is added to the $messages_ref array =cut -sub read_gs1_json_file($json_file, $products_ref, $messages_ref) { +sub read_gs1_json_file ($json_file, $products_ref, $messages_ref) { + + $log->debug("read_gs1_json_file", {json_file => $json_file}) if $log->is_debug(); - $log->debug("read_gs1_json_file", { json_file => $json_file }) if $log->is_debug(); - - open (my $in, "<", $json_file) or die("Cannot open json file $json_file : $!\n"); - my $json = join (q{}, (<$in>)); + open(my $in, "<", $json_file) or die("Cannot open json file $json_file : $!\n"); + my $json = join(q{}, (<$in>)); close($in); my $json_ref = decode_json($json); @@ -1578,22 +1753,20 @@ sub read_gs1_json_file($json_file, $products_ref, $messages_ref) { # to the format generated by the nodejs xml2json module # which is the expected format of the ProductOpener::GS1 module convert_single_text_property_to_direct_value($json_ref); - + convert_gs1_json_message_to_off_products_csv($json_ref, $products_ref, $messages_ref); return; } - sub generate_gs1_message_identifier() { # local GLN + 60 random hexadecimal characters my $identifier = deep_get(\%options, qw(gs1 local_gln)) . "_"; - $identifier .= sprintf("%x", rand 16) for 1..60; + $identifier .= sprintf("%x", rand 16) for 1 .. 60; return $identifier; } - =head2 generate_gs1_confirmation_message ($notification_message_ref, $timestamp) GS1 data pools (catalogs) send us GSDN Catalogue Item Notification (CIN) which are messages @@ -1618,7 +1791,7 @@ generate test confirmation messages which don't have a different content every t =cut -sub generate_gs1_confirmation_message($notification_message_ref, $timestamp) { +sub generate_gs1_confirmation_message ($notification_message_ref, $timestamp) { # We will need to generate a message identifier, put it in the XML content, # and return it as it is used as the file name @@ -1641,19 +1814,19 @@ sub generate_gs1_confirmation_message($notification_message_ref, $timestamp) { # Include the notification data in the template data for the confirmation $confirmation_data_ref->{notification} = $notification_message_ref; - my $xml; if (process_template('gs1/catalogue_item_confirmation.tt.xml', $confirmation_data_ref, \$xml)) { - $log->debug("generate_gs1_confirmation_message - success", { confirmation_instance_identifier => $confirmation_instance_identifier}) if $log->is_error(); + $log->debug("generate_gs1_confirmation_message - success", + {confirmation_instance_identifier => $confirmation_instance_identifier}) + if $log->is_error(); } else { - $log->error("generate_gs1_confirmation_message - template error", { error => $tt->error() }) if $log->is_error(); + $log->error("generate_gs1_confirmation_message - template error", {error => $tt->error()}) if $log->is_error(); } return ($confirmation_instance_identifier, $xml); } - =head2 write_off_csv_file ($csv_file, $products_ref) Write all product data from the $products_ref array to a CSV file in OFF format. @@ -1666,34 +1839,38 @@ Write all product data from the $products_ref array to a CSV file in OFF format. =cut -sub write_off_csv_file($csv_file, $products_ref) { +sub write_off_csv_file ($csv_file, $products_ref) { + + $log->debug("write_off_csv_file", {csv_file => $csv_file}) if $log->is_debug(); - $log->debug("write_off_csv_file", { csv_file => $csv_file }) if $log->is_debug(); - open(my $filehandle, ">:encoding(UTF-8)", $csv_file) or die("Cannot write csv file $csv_file : $!\n"); - + my $separator = "\t"; - - my $csv = Text::CSV->new ( { binary => 1 , sep_char => $separator } ) # should set binary attribute. - or die "Cannot use CSV: ".Text::CSV->error_diag (); + + my $csv = Text::CSV->new({binary => 1, sep_char => $separator}) # should set binary attribute. + or die "Cannot use CSV: " . Text::CSV->error_diag(); # Print the header line with fields names - - $log->debug("write_off_csv_file - header", { csv_fields => \@csv_fields }) if $log->is_debug(); - - $csv->print ($filehandle, \@csv_fields); + + $log->debug("write_off_csv_file - header", {csv_fields => \@csv_fields}) if $log->is_debug(); + + $csv->print($filehandle, \@csv_fields); print $filehandle "\n"; - + # We may have the same product multiple times, sort by sources_fields:org-gs1:publicationDateTime # or by lastChangeDateTime (publicationDateTime is not in the CodeOnline export) my %seen_products = (); - + foreach my $product_ref ( - sort {($b->{"sources_fields:org-gs1:publicationDateTime"} // $b->{"sources_fields:org-gs1:lastChangeDateTime"}) - cmp ($a->{"sources_fields:org-gs1:publicationDateTime"} // $a->{"sources_fields:org-gs1:lastChangeDateTime"}) } - @$products_ref) { - - $log->debug("write_off_csv_file - product", { code => $product_ref->{code} }) if $log->is_debug(); + sort { + ($b->{"sources_fields:org-gs1:publicationDateTime"} // $b->{"sources_fields:org-gs1:lastChangeDateTime"}) + cmp($a->{"sources_fields:org-gs1:publicationDateTime"} + // $a->{"sources_fields:org-gs1:lastChangeDateTime"}) + } @$products_ref + ) + { + + $log->debug("write_off_csv_file - product", {code => $product_ref->{code}}) if $log->is_debug(); if (defined $seen_products{$product_ref->{code}}) { # Skip product for which we have a more recent publication next; @@ -1701,21 +1878,20 @@ sub write_off_csv_file($csv_file, $products_ref) { else { $seen_products{$product_ref->{code}} = 1; } - + my @csv_fields_values = (); foreach my $field (@csv_fields) { push @csv_fields_values, $product_ref->{$field}; } - - $csv->print ($filehandle, \@csv_fields_values); + + $csv->print($filehandle, \@csv_fields_values); print $filehandle "\n"; } - + close $filehandle; return; } - =head2 print_unknown_entries_in_gs1_maps () Prints the entries for GS1 data types for which we do not have a corresponding OFF match, @@ -1724,22 +1900,24 @@ ordered by the number of occurrences in the GS1 data =cut sub print_unknown_entries_in_gs1_maps() { - + my $unknown_entries = 0; - + foreach my $map (sort keys %unknown_entries_in_gs1_maps) { print "$map map has unknown entries:\n"; - - foreach my $source_value - (sort { $unknown_entries_in_gs1_maps{$map}{$a} <=> $unknown_entries_in_gs1_maps{$map}{$b} } - keys %{$unknown_entries_in_gs1_maps{$map}}) { + + foreach my $source_value ( + sort {$unknown_entries_in_gs1_maps{$map}{$a} <=> $unknown_entries_in_gs1_maps{$map}{$b}} + keys %{$unknown_entries_in_gs1_maps{$map}} + ) + { print $source_value . "\t" . $unknown_entries_in_gs1_maps{$map}{$source_value} . "\n"; $unknown_entries++; } - + print "\n"; } - + return $unknown_entries; } diff --git a/lib/ProductOpener/ImportConvert.pm b/lib/ProductOpener/ImportConvert.pm index 9df089ae1b62d..b44b9027be0e8 100644 --- a/lib/ProductOpener/ImportConvert.pm +++ b/lib/ProductOpener/ImportConvert.pm @@ -44,16 +44,15 @@ convert the product data they contain to a format that can be imported on Open F package ProductOpener::ImportConvert; use ProductOpener::PerlStandards; -use Exporter qw< import >; +use Exporter qw< import >; use Log::Any qw($log); use Storable qw(dclone); use Text::Fuzzy; -BEGIN -{ - use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); +BEGIN { + use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS); @EXPORT_OK = qw( %fields @@ -91,11 +90,11 @@ BEGIN @xml_errors - ); # symbols to export on request + ); # symbols to export on request %EXPORT_TAGS = (all => [@EXPORT_OK]); } -use vars @EXPORT_OK ; +use vars @EXPORT_OK; use ProductOpener::Config qw/:all/; use ProductOpener::Store qw/:all/; @@ -126,7 +125,7 @@ my $mode = "append"; =cut -sub get_or_create_product_for_code($code) { +sub get_or_create_product_for_code ($code) { if (not defined $code) { die("Undefined code $code"); @@ -146,7 +145,7 @@ sub get_or_create_product_for_code($code) { return $products{$code}; } -sub assign_value($product_ref, $target, $value) { +sub assign_value ($product_ref, $target, $value) { my $field = $target; @@ -177,8 +176,11 @@ sub assign_value($product_ref, $target, $value) { $value =~ s/(\.|\,)(\d*[1-9])0+$/$1$2/; } - if ((defined $product_ref->{$field}) and ($product_ref->{$field} ne "") and ($mode eq "append") - and ($product_ref->{$field} ne $value)) { + if ( (defined $product_ref->{$field}) + and ($product_ref->{$field} ne "") + and ($mode eq "append") + and ($product_ref->{$field} ne $value)) + { if (exists $tags_fields{$field}) { if ($target =~ /^!/) { @@ -208,8 +210,7 @@ sub assign_value($product_ref, $target, $value) { return; } - -sub remove_value($product_ref, $target, $value) { +sub remove_value ($product_ref, $target, $value) { my $field = $target; @@ -220,12 +221,10 @@ sub remove_value($product_ref, $target, $value) { return; } - -sub apply_global_params($product_ref) { +sub apply_global_params ($product_ref) { $mode = "append"; - foreach my $field (sort keys %global_params) { assign_value($product_ref, $field, $global_params{$field}); @@ -236,13 +235,15 @@ sub apply_global_params($product_ref) { # some producers send us data for products in different languages sold in different markets -sub assign_main_language_of_product($product_ref, $lcs_ref, $default_lc) { +sub assign_main_language_of_product ($product_ref, $lcs_ref, $default_lc) { if ((not defined $product_ref->{lc}) or (not defined $product_ref->{"product_name_" . $product_ref->{lc}})) { foreach my $possible_lc (@{$lcs_ref}) { - if ((defined $product_ref->{"product_name_" . $possible_lc}) and ($product_ref->{"product_name_" . $possible_lc} !~ /^\s*$/)) { - $log->info("assign_main_language_of_product: assigning value", { lc => $possible_lc}) if $log->is_info(); + if ( (defined $product_ref->{"product_name_" . $possible_lc}) + and ($product_ref->{"product_name_" . $possible_lc} !~ /^\s*$/)) + { + $log->info("assign_main_language_of_product: assigning value", {lc => $possible_lc}) if $log->is_info(); assign_value($product_ref, "lc", $possible_lc); last; } @@ -250,34 +251,37 @@ sub assign_main_language_of_product($product_ref, $lcs_ref, $default_lc) { } if (not defined $product_ref->{lc}) { - $log->info("assign_main_language_of_product: assigning default value", { lc => $default_lc}) if $log->is_info(); + $log->info("assign_main_language_of_product: assigning default value", {lc => $default_lc}) if $log->is_info(); assign_value($product_ref, "lc", $default_lc); } return; } -sub assign_countries_for_product($product_ref, $lcs_ref, $default_country) { +sub assign_countries_for_product ($product_ref, $lcs_ref, $default_country) { foreach my $possible_lc (sort keys %{$lcs_ref}) { if (defined $product_ref->{"product_name_" . $possible_lc}) { - assign_value($product_ref,"countries", $lcs_ref->{$possible_lc}); - $log->info("assign_countries_for_product: found lc - assigning value", { lc => $possible_lc, countries => $lcs_ref->{$possible_lc}}) if $log->is_info(); + assign_value($product_ref, "countries", $lcs_ref->{$possible_lc}); + $log->info( + "assign_countries_for_product: found lc - assigning value", + {lc => $possible_lc, countries => $lcs_ref->{$possible_lc}} + ) if $log->is_info(); } } if ((not defined $product_ref->{countries}) or ($product_ref->{countries} eq "")) { - assign_value($product_ref,"countries", $default_country); - $log->info("assign_countries_for_product: assigning default value", { countries => $default_country}) if $log->is_info(); + assign_value($product_ref, "countries", $default_country); + $log->info("assign_countries_for_product: assigning default value", {countries => $default_country}) + if $log->is_info(); } return; } - # Match all tags that exist in a taxonomy. Needs the input field to be split, so there must be separators. -sub match_taxonomy_tags($product_ref, $source, $target, $options_ref) { +sub match_taxonomy_tags ($product_ref, $source, $target, $options_ref) { # logo ab # logo bio européen : nl-bio-01 agriculture pays bas 1 @@ -292,7 +296,9 @@ sub match_taxonomy_tags($product_ref, $source, $target, $options_ref) { if ((defined $product_ref->{$source}) and ($product_ref->{$source} ne "")) { - $log->trace("match_taxonomy_tags: init", { source => $source, value => $product_ref->{$source}, target => $target}) if $log->is_trace(); + $log->trace("match_taxonomy_tags: init", + {source => $source, value => $product_ref->{$source}, target => $target}) + if $log->is_trace(); my @values = ($product_ref->{$source}); if ((defined $options_ref) and (defined $options_ref->{split}) and ($options_ref->{split} ne "")) { @@ -312,20 +318,26 @@ sub match_taxonomy_tags($product_ref, $source, $target, $options_ref) { $value =~ s/\s+$//; my $canon_tag = canonicalize_taxonomy_tag($product_ref->{lc}, $target, $value); - $log->trace("match_taxonomy_tags: split value", { value => $value, canon_tag => $canon_tag}) if $log->is_trace(); - + $log->trace("match_taxonomy_tags: split value", {value => $value, canon_tag => $canon_tag}) + if $log->is_trace(); if (exists_taxonomy_tag($target, $canon_tag)) { assign_value($product_ref, $target, $canon_tag); - $log->info("match_taxonomy_tags: assigning value", { source => $source, value => $canon_tag, target => $target}) if $log->is_info(); + $log->info("match_taxonomy_tags: assigning value", + {source => $source, value => $canon_tag, target => $target}) + if $log->is_info(); } # try to see if we have a packager code # e.g. from Carrefour: Fabriqué en France par EMB 29181 (F) ou EMB 86092A (G) pour Interdis. elsif (($value =~ /^((e|emb)(\s|-|\.)*(\d{5})(\s|-|\.)*(\w)?)$/i) - or ($value =~ /([a-z][a-z])(\s|\.|-)+\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+(ce|ec|eg)/i)) { - assign_value($product_ref,"emb_codes", $value); - $log->info("match_taxonomy_tags: found packaging code - assigning value", { source => $source, value => $value, target => "emb_codes"}) if $log->is_info(); + or ($value =~ /([a-z][a-z])(\s|\.|-)+\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+\d\d\d(\s|\.|-)+(ce|ec|eg)/i)) + { + assign_value($product_ref, "emb_codes", $value); + $log->info( + "match_taxonomy_tags: found packaging code - assigning value", + {source => $source, value => $value, target => "emb_codes"} + ) if $log->is_info(); } } } @@ -333,20 +345,28 @@ sub match_taxonomy_tags($product_ref, $source, $target, $options_ref) { return; } - # Match only specific tags (e.g. "organic" + "label rouge" in product name) -sub match_specific_taxonomy_tags($product_ref, $source, $target, $tags_ref) { +sub match_specific_taxonomy_tags ($product_ref, $source, $target, $tags_ref) { my $tag_lc = $product_ref->{lc}; - $log->trace("match_specific_taxonomy_tags - start", { source => $source, source_value => $product_ref->{$source}, target => $target, tag_lc => $tag_lc, tags_ref => $tags_ref}) if $log->is_trace(); + $log->trace( + "match_specific_taxonomy_tags - start", + { + source => $source, + source_value => $product_ref->{$source}, + target => $target, + tag_lc => $tag_lc, + tags_ref => $tags_ref + } + ) if $log->is_trace(); if ((defined $product_ref->{$source}) and ($product_ref->{$source} ne "")) { foreach my $tagid (@{$tags_ref}) { - $log->trace("match_specific_taxonomy_tags - looping through tags", { tagid => $tagid}) if $log->is_trace(); + $log->trace("match_specific_taxonomy_tags - looping through tags", {tagid => $tagid}) if $log->is_trace(); if (defined $translations_to{$target}{$tagid}{$tag_lc}) { @@ -361,11 +381,11 @@ sub match_specific_taxonomy_tags($product_ref, $source, $target, $tags_ref) { } my $tag_regexp = ""; - foreach my $synonym (sort { length($b) <=> length($a) } @synonyms) { + foreach my $synonym (sort {length($b) <=> length($a)} @synonyms) { # simple singulars and plurals my $singular = $synonym; $synonym =~ s/s$//; - $tag_regexp .= '|' . $synonym . '|' . $synonym . 's' ; + $tag_regexp .= '|' . $synonym . '|' . $synonym . 's'; my $unaccented_synonym = unac_string_perl($synonym); if ($unaccented_synonym ne $synonym) { @@ -375,16 +395,18 @@ sub match_specific_taxonomy_tags($product_ref, $source, $target, $tags_ref) { } $tag_regexp =~ s/^\|//; - $log->trace("match_specific_taxonomy_tags - regexp", { tag_regexp => $tag_regexp}) if $log->is_trace(); - $log->trace("match_specific_taxonomy_tags - source value", { source_value => $product_ref->{$source}}) if $log->is_trace(); + $log->trace("match_specific_taxonomy_tags - regexp", {tag_regexp => $tag_regexp}) if $log->is_trace(); + $log->trace("match_specific_taxonomy_tags - source value", {source_value => $product_ref->{$source}}) + if $log->is_trace(); if ($product_ref->{$source} =~ /\b(${tag_regexp})\b/i) { $log->info( "match_specific_taxonomy_tags: assigning value", - { matching => $1, - source => $source, - value => $tagid, - target => $target + { + matching => $1, + source => $source, + value => $tagid, + target => $target } ) if $log->is_info(); assign_value($product_ref, $target, $tagid); @@ -396,7 +418,7 @@ sub match_specific_taxonomy_tags($product_ref, $source, $target, $tags_ref) { return; } -sub match_labels_in_product_name($product_ref) { +sub match_labels_in_product_name ($product_ref) { my $tag_lc = $product_ref->{lc}; @@ -412,12 +434,12 @@ sub match_labels_in_product_name($product_ref) { return; } - -sub split_allergens($allergens) { +sub split_allergens ($allergens) { # simple allergen (not an enumeration) -> return _$allergens_ - if (($allergens !~ /,/) - and (not ($allergens =~ / (et|and) /i))) { + if ( ($allergens !~ /,/) + and (not($allergens =~ / (et|and) /i))) + { return "_" . $allergens . "_"; } else { @@ -425,8 +447,6 @@ sub split_allergens($allergens) { } } - - =head2 assign_quantity_from_field ( $product_ref, $field ) Look for a quantity in a field like a product name. @@ -434,11 +454,15 @@ Assign it to the quantity and remove it from the field. =cut -sub assign_quantity_from_field($product_ref, $field) { +sub assign_quantity_from_field ($product_ref, $field) { - if ((defined $product_ref->{$field}) and ((not defined $product_ref->{quantity}) or ($product_ref->{quantity} eq ""))) { + if ( (defined $product_ref->{$field}) + and ((not defined $product_ref->{quantity}) or ($product_ref->{quantity} eq ""))) + { - if ($product_ref->{$field} =~ /\b\(?((\d+)\s?x\s?)?(\d+\.?\,?\d*)\s?(g|gr|kg|kgr|l|cl|ml|dl)\s?(x\s?(\d+))?\)?\s*$/i) { + if ($product_ref->{$field} + =~ /\b\(?((\d+)\s?x\s?)?(\d+\.?\,?\d*)\s?(g|gr|kg|kgr|l|cl|ml|dl)\s?(x\s?(\d+))?\)?\s*$/i) + { my $before = $`; @@ -446,9 +470,7 @@ sub assign_quantity_from_field($product_ref, $field) { # e.g. Barres de Céréales (8+4) x 25g # if we have a single x or a * before, skip - if (not ( - ($before =~ /(\sx|\*)\s*$/i) - )) { + if (not(($before =~ /(\sx|\*)\s*$/i))) { $product_ref->{$field} = $before; @@ -471,7 +493,6 @@ sub assign_quantity_from_field($product_ref, $field) { return; } - =head2 remove_quantity_from_field ( $product_ref, $field ) Look for the quantity in a field like a product name. @@ -479,14 +500,14 @@ If found, remove it from the field. =cut -sub remove_quantity_from_field($product_ref, $field) { +sub remove_quantity_from_field ($product_ref, $field) { if (defined $product_ref->{$field}) { - + my $quantity = $product_ref->{quantity}; my $quantity_value = $product_ref->{quantity_value}; my $quantity_unit = $product_ref->{quantity_unit}; - + if (defined $quantity) { $quantity =~ s/\(/\\\(/g; $quantity =~ s/\)/\\\)/g; @@ -496,18 +517,23 @@ sub remove_quantity_from_field($product_ref, $field) { $product_ref->{$field} = $`; } } - elsif ((defined $quantity_value) and (defined $quantity_unit) and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value $quantity_unit\)?\s*$/i)) { + elsif ( (defined $quantity_value) + and (defined $quantity_unit) + and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value $quantity_unit\)?\s*$/i)) + { $product_ref->{$field} = $`; } - elsif ((defined $quantity_value) and (defined $quantity_unit) and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value$quantity_unit\)?\s*$/i)) { + elsif ( (defined $quantity_value) + and (defined $quantity_unit) + and ($product_ref->{$field} =~ /\s*\b\(?$quantity_value$quantity_unit\)?\s*$/i)) + { $product_ref->{$field} = $`; - } + } } return; } - -sub clean_weights($product_ref) { +sub clean_weights ($product_ref) { # normalize weights @@ -533,8 +559,9 @@ sub clean_weights($product_ref) { # we can be passed values in a specific unit (e.g. quantity_in_mg) if (not defined $product_ref->{$field}) { foreach my $u ('kg', 'g', 'mg', 'mcg', 'l', 'dl', 'cl', 'ml') { - if ((defined $product_ref->{$field . "_value_in_" . $u}) - and ($product_ref->{$field . "_value_in_" . $u} ne "")) { + if ( (defined $product_ref->{$field . "_value_in_" . $u}) + and ($product_ref->{$field . "_value_in_" . $u} ne "")) + { assign_value($product_ref, $field . "_value", $product_ref->{$field . "_value_in_" . $u}); assign_value($product_ref, $field . "_unit", $u); last; @@ -543,20 +570,24 @@ sub clean_weights($product_ref) { } # if we have a value but no unit, assume the unit is grams for weights, if the value is greater than 20 and less than 5000 - if ((defined $product_ref->{$field . "_value"}) + if ( + (defined $product_ref->{$field . "_value"}) and ($product_ref->{$field . "_value"} ne "") - and ((not defined $product_ref->{$field . "_unit"}) + and ( (not defined $product_ref->{$field . "_unit"}) or ($product_ref->{$field . "_unit"} eq "")) and ($product_ref->{$field . "_value"} > 20) and ($product_ref->{$field . "_value"} < 2000) - and ($field =~ /weight/)) { + and ($field =~ /weight/) + ) + { assign_value($product_ref, $field . "_unit", "g"); } # We may be passed quantity_value_unit, in that case assign it to quantity - if ((not defined $product_ref->{$field}) + if ( (not defined $product_ref->{$field}) and (defined $product_ref->{$field . "_value_unit"}) - and ($product_ref->{$field . "_value_unit"} ne "")) { + and ($product_ref->{$field . "_value_unit"} ne "")) + { assign_value($product_ref, $field, $product_ref->{$field . "_value_unit"}); } @@ -566,22 +597,38 @@ sub clean_weights($product_ref) { # - a value and a unit ("30 g") # in this case, we can combine them: "2 biscuits (30 g)" - if ((($field eq "quantity") or ($field eq "serving_size")) - and (defined $product_ref->{$field}) and ($product_ref->{$field} ne "") - and (defined $product_ref->{$field . "_value"}) and ($product_ref->{$field . "_value"} ne "") + if ( + (($field eq "quantity") or ($field eq "serving_size")) + and (defined $product_ref->{$field}) + and ($product_ref->{$field} ne "") + and (defined $product_ref->{$field . "_value"}) + and ($product_ref->{$field . "_value"} ne "") and (defined $product_ref->{$field . "_unit"}) # check we have not already combined the value and unit - and (not (index($product_ref->{$field}, $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}) >= 0)) ) { - - assign_value($product_ref, $field, $product_ref->{$field} . " (" . $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"} . ")" ); + and ( + not( + index($product_ref->{$field}, + $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}) >= 0 + ) + ) + ) + { + + assign_value($product_ref, $field, + $product_ref->{$field} . " (" + . $product_ref->{$field . "_value"} . " " + . $product_ref->{$field . "_unit"} + . ")"); } - elsif ((not defined $product_ref->{$field}) + elsif ( (not defined $product_ref->{$field}) and (defined $product_ref->{$field . "_value"}) and ($product_ref->{$field . "_value"} ne "") - and (defined $product_ref->{$field . "_unit"}) ) { + and (defined $product_ref->{$field . "_unit"})) + { - assign_value($product_ref, $field, $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}); + assign_value($product_ref, $field, + $product_ref->{$field . "_value"} . " " . $product_ref->{$field . "_unit"}); } if (defined $product_ref->{$field}) { @@ -629,39 +676,41 @@ sub clean_weights($product_ref) { # poids net total : 200g [zemetro] poids net égoutté : 140g contenance 212ml my %regexps = ( -fr => { -net_weight => '(poids )?net( total)?', -drained_weight => '(poids )?(net )?(égoutté|egoutte)', -volume => '(volume|contenance)( net|nette)?( total)?', -}, + fr => { + net_weight => '(poids )?net( total)?', + drained_weight => '(poids )?(net )?(égoutté|egoutte)', + volume => '(volume|contenance)( net|nette)?( total)?', + }, -# Peso neto: 480 g (6 x 80 g) Peso neto escurrido: 336 g (6x56 g) + # Peso neto: 480 g (6 x 80 g) Peso neto escurrido: 336 g (6x56 g) -es => { -net_weight => '(peso )?neto( total)?', -drained_weight => '(peso )?(neto )?(escurrido)', -#volume => '(volume|contenance)( net|nette)?( total)?', -}, + es => { + net_weight => '(peso )?neto( total)?', + drained_weight => '(peso )?(neto )?(escurrido)', + #volume => '(volume|contenance)( net|nette)?( total)?', + }, ); if (defined $product_ref->{total_weight}) { - $log->debug("clean_weights", { lc => $product_ref->{lc}, total_weight => $product_ref->{total_weight} }) if $log->is_debug(); + $log->debug("clean_weights", {lc => $product_ref->{lc}, total_weight => $product_ref->{total_weight}}) + if $log->is_debug(); if ((defined $product_ref->{lc}) and (defined $regexps{$product_ref->{lc}})) { foreach my $field ("net_weight", "drained_weight", "volume") { - if ((not defined $product_ref->{$field}) - and (defined $regexps{$product_ref->{lc}}{$field})) { + if ( (not defined $product_ref->{$field}) + and (defined $regexps{$product_ref->{lc}}{$field})) + { my $regexp = $regexps{$product_ref->{lc}}{$field}; - if ($product_ref->{total_weight} =~ /$regexp/i ) { + if ($product_ref->{total_weight} =~ /$regexp/i) { my $after = $'; # match number with unit - $log->debug("clean_weights - matched", { field => $field, after => $after }) if $log->is_debug(); + $log->debug("clean_weights - matched", {field => $field, after => $after}) if $log->is_debug(); if ($after =~ /\s?:?\s?(\d[0-9\.\,]+\s*(\w+))/i) { assign_value($product_ref, $field, $1); @@ -692,25 +741,31 @@ drained_weight => '(peso )?(neto )?(escurrido)', } } - my $normalized_quantity; if (defined $product_ref->{quantity}) { $normalized_quantity = normalize_quantity($product_ref->{quantity}); } # empty or incomplete quantity, but net_weight etc. present - if ((not defined $product_ref->{quantity}) or ($product_ref->{quantity} eq "") or (not defined $normalized_quantity) - or (($product_ref->{lc} eq "fr") and ($product_ref->{quantity} =~ /^\d+ tranche([[:alpha:]]*)$/)) # French : "6 tranches épaisses" - or ($product_ref->{quantity} =~ /^\(.+\)$/) # (4 x 125 g) - ) { + if ( + (not defined $product_ref->{quantity}) + or ($product_ref->{quantity} eq "") + or (not defined $normalized_quantity) + or ( ($product_ref->{lc} eq "fr") + and ($product_ref->{quantity} =~ /^\d+ tranche([[:alpha:]]*)$/)) # French : "6 tranches épaisses" + or ($product_ref->{quantity} =~ /^\(.+\)$/) # (4 x 125 g) + ) + { # See if we have other quantity related values: net_weight_value net_weight_unit drained_weight_value drained_weight_unit volume_value volume_unit my $extra_quantity; foreach my $field ("net_weight", "drained_weight", "total_weight", "volume") { - if ((defined $product_ref->{$field}) and ($product_ref->{$field} ne "") - and ($product_ref->{$field} =~ /^\d/) ) { # make sure we have a number + if ( (defined $product_ref->{$field}) + and ($product_ref->{$field} ne "") + and ($product_ref->{$field} =~ /^\d/)) + { # make sure we have a number $extra_quantity = $product_ref->{$field}; last; } @@ -734,7 +789,6 @@ drained_weight => '(peso )?(neto )?(escurrido)', return; } - =head2 clean_fields ( $imported_product_ref ) This function: @@ -758,32 +812,23 @@ my %unspecified = ( 'unspecified', '(not|non)( |-|_)specified', 'not( |-|_)applicable', - 'na', - 'n\/a', - 'unknown', - 'not( |-|_)known', - ], - 'es' => [ - 'no aplica', + 'na', 'n\/a', 'unknown', 'not( |-|_)known', ], + 'es' => ['no aplica',], 'fr' => [ 'non( |-|_)(d(é|e)clar|indiqu|sp(é|e)cifi|renseign)(é|e)(e?)(s?)', 'ras|rien (à|a) signaler', - 'rien,n(é|e)ant', - 'n\/r', - 'nr', - 'inconnu(e?)(s?)', - 'non( |-|_)connu(e?)(s?)', + 'rien,n(é|e)ant', 'n\/r', 'nr', 'inconnu(e?)(s?)', 'non( |-|_)connu(e?)(s?)', ], ); -sub clean_fields($product_ref) { +sub clean_fields ($product_ref) { - $log->debug("clean_fields - start", { }) if $log->is_debug(); + $log->debug("clean_fields - start", {}) if $log->is_debug(); # Quantity in the product name? assign_quantity_from_field($product_ref, "product_name_" . $product_ref->{lc}); - + remove_quantity_from_field($product_ref, "product_name_" . $product_ref->{lc}); # Populate the quantity / weight fields from their quantity_value_unit, quantity_value, quantity_unit etc. components @@ -820,7 +865,7 @@ sub clean_fields($product_ref) { foreach my $field (keys %{$product_ref}) { - $log->debug("clean_fields", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + $log->debug("clean_fields", {field => $field, value => $product_ref->{$field}}) if $log->is_debug(); if (not defined $product_ref->{$field}) { print STDERR "undefined value for field $field\n"; @@ -862,15 +907,18 @@ sub clean_fields($product_ref) { if (($product_ref->{$field} =~ /^(\s|-|\.|_)$/) and ($product_ref->{$field} ne '-')) { $product_ref->{$field} = ""; } - + # Remove "unspecified" values my @unspecified_lcs = ("en"); - if ((defined $product_ref->{lc}) and ($product_ref->{lc} ne 'en') and (defined $unspecified{$product_ref->{lc}})) { + if ( (defined $product_ref->{lc}) + and ($product_ref->{lc} ne 'en') + and (defined $unspecified{$product_ref->{lc}})) + { push @unspecified_lcs, $product_ref->{lc}; } - + foreach my $l (@unspecified_lcs) { - + foreach my $regexp (@{$unspecified{$l}}) { if ($product_ref->{$field} =~ /^\s*($regexp)\s*$/i) { if (defined $tags_fields{$field}) { @@ -892,15 +940,15 @@ sub clean_fields($product_ref) { # FR 62.907.030 EC (DANS UN OVALE) $product_ref->{$field} =~ s/\(?dans un ovale\)?//ig; - + # FR 72.024.001 EC - FR 72 024 520 CE $product_ref->{$field} =~ s/ (CE|EC) - ([A-Z][A-Z]) / $1, $2/g; } - + # category with "organic" in it if ($field eq "categories") { - $product_ref->{$field} =~ s/^organic //i; # English - $product_ref->{$field} =~ s/ bio$//i; # French + $product_ref->{$field} =~ s/^organic //i; # English + $product_ref->{$field} =~ s/ bio$//i; # French } # Origin of ingredients that contains other things than tags (e.g. Leroux) @@ -923,20 +971,21 @@ sub clean_fields($product_ref) { } if ($field =~ /^(ingredients_text|product_name|abbreviated_product_name|generic_name|brands)/) { - + # Lowercase fields in ALL CAPS # Capitalize all lowercase fields - + # do not count x4 as a lowercase letter # e.g. KINDER COUNTRY BARRE DE CEREALES ENROBEE DE CHOCOLAT 2x9 BARRES - + my $value = $product_ref->{$field}; $value =~ s/x(\d)/X$1/; $value =~ s/(\d)x/$1X/; - - if ((($value =~ /[A-Z]{4}/) and ($value !~ /[a-z]/)) - or (($value =~ /[a-z]{4}/) and ($value !~ /[A-Z]/)) ) { - + + if ( (($value =~ /[A-Z]{4}/) and ($value !~ /[a-z]/)) + or (($value =~ /[a-z]{4}/) and ($value !~ /[A-Z]/))) + { + # Tag field: uppercase the first letter (e.g. brands) if (defined $tags_fields{$field}) { $product_ref->{$field} = join(", ", map {ucfirst} split /, |,/, lc($product_ref->{$field})); @@ -944,32 +993,33 @@ sub clean_fields($product_ref) { else { $product_ref->{$field} = ucfirst(lc($product_ref->{$field})); } - $log->debug("clean_fields - after lowercase", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + $log->debug("clean_fields - after lowercase", {field => $field, value => $product_ref->{$field}}) + if $log->is_debug(); } - + # Remove fields with "0" if ($product_ref->{$field} ne '-') { $product_ref->{$field} =~ s/^( |0|-|_|\.|\/|\*|;)+$//; } - + # Remove HTML comments $product_ref->{$field} =~ s///sg; - + # if there's some HTML code, making special cases to try to repair is probably more dangerous than just ignoring the fields # e.g. )/) { $product_ref->{$field} = ""; } - + } - + # All fields # Ingredients if ($field =~ /^ingredients_text/) { - + # _x000D_ $product_ref->{$field} =~ s/_x000D_/\n/g; @@ -997,35 +1047,38 @@ sub clean_fields($product_ref) { # extrait de malt d'orge - sel $product_ref->{$field} =~ s/ -( |)<\/b>/<\/b> -$1/ig; - $log->debug("clean_fields - ingredients_text - 1", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); - + $log->debug("clean_fields - ingredients_text - 1", {field => $field, value => $product_ref->{$field}}) + if $log->is_debug(); $product_ref->{$field} =~ s/(.*?)<\/b>/split_allergens($1)/iesg; $product_ref->{$field} =~ s/|<\/b>//ig; - $log->debug("clean_fields - ingredients_text - 2", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); - + $log->debug("clean_fields - ingredients_text - 2", {field => $field, value => $product_ref->{$field}}) + if $log->is_debug(); + # Ingredients without separators # e.g. found in some CodeOnline data: "Ingrédients : Pur cacao de MadagascarŒufs fraisHuiles végétalesGélifiant végétalSucre" - + if ($product_ref->{$field} !~ /,|;| - /) { $product_ref->{$field} =~ s/(\p{Lower}\p{Lower}+)(?=\p{Upper}\p{Lower}\p{Lower})/$1, /g; } - if ($field eq "ingredients_text_fr") { # remove single sentence that say allergens are in bold (in Casino data) - $product_ref->{$field} =~ s/(Les |l')?(information|ingrédient|indication)(s?) ([^\.,]*) (personnes )?((allergiques( (ou|et) intolérant(e|)s)?)|(intolérant(e|)s( (ou|et) allergiques)?))(\.)?//i; + $product_ref->{$field} + =~ s/(Les |l')?(information|ingrédient|indication)(s?) ([^\.,]*) (personnes )?((allergiques( (ou|et) intolérant(e|)s)?)|(intolérant(e|)s( (ou|et) allergiques)?))(\.)?//i; $product_ref->{$field} = ucfirst($product_ref->{$field}); - $log->debug("clean_fields - ingredients_text - 3", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + $log->debug("clean_fields - ingredients_text - 3", {field => $field, value => $product_ref->{$field}}) + if $log->is_debug(); # Missing spaces # Poire Williams - sucre de canne - sucre - gélifiant : pectines de fruits - acidifiant : acide citrique.Préparée avec 55 g de fruits pour 100 g de produit fini.Teneur totale en sucres 56 g pour 100 g de produit fini.Traces de _fruits à coque_ et de _lait_.. $product_ref->{$field} =~ s/\.([A-Z][a-z])/\. $1/g; - $log->debug("clean_fields - ingredients_text - 4", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + $log->debug("clean_fields - ingredients_text - 4", {field => $field, value => $product_ref->{$field}}) + if $log->is_debug(); } @@ -1038,23 +1091,31 @@ sub clean_fields($product_ref) { # _d'arachide_ # morceaux _d’amandes_ grillées - if (($field =~ /_fr/) or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) { + if ( ($field =~ /_fr/) + or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) + { $product_ref->{$field} =~ s/_(d|l)('|’)([^_,-;]+)_/$1'_$2_/ig; } } if ($field =~ /^ingredients_text_(\w\w)/) { my $ingredients_lc = $1; - $log->debug("clean_fields - before clean_ingredients_text_for_lang ", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + $log->debug( + "clean_fields - before clean_ingredients_text_for_lang ", + {field => $field, value => $product_ref->{$field}} + ) if $log->is_debug(); $product_ref->{$field} = clean_ingredients_text_for_lang($product_ref->{$field}, $ingredients_lc); - $log->debug("clean_fields - after clean_ingredients_text_for_lang ", { field=>$field, value=>$product_ref->{$field} }) if $log->is_debug(); + $log->debug( + "clean_fields - after clean_ingredients_text_for_lang ", + {field => $field, value => $product_ref->{$field}} + ) if $log->is_debug(); } if ($field =~ /^nutriscore_grade_/) { $product_ref->{$field} = lc($product_ref->{$field}); } - + if ($field eq "nutriscore_grade_producer") { # Nutriscore_A -> a $product_ref->{$field} =~ s/(nutri-score|nutriscore)(\s|:|-|_|\.)+([a-e])/$3/i; @@ -1062,30 +1123,33 @@ sub clean_fields($product_ref) { # remove N, N/A, NA etc. # but not "no", "none" that are useful values (e.g. for specific labels "organic:no", allergens : "none") - $product_ref->{$field} =~ s/(^|,)\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseigné|non applicable|no aplica|nr|n\/r)\s*(,|$)//ig; - + $product_ref->{$field} + =~ s/(^|,)\s*((n(\/|\.)?a(\.)?)|(not applicable)|unknown|inconnu|inconnue|non renseigné|non applicable|no aplica|nr|n\/r)\s*(,|$)//ig; + # remove none except for allergens and traces if ($field !~ /allergens|traces/) { - $product_ref->{$field} =~ s/(^|,)\s*(none|aucun|aucune|aucun\(e\))\s*(,|$)//ig; + $product_ref->{$field} =~ s/(^|,)\s*(none|aucun|aucune|aucun\(e\))\s*(,|$)//ig; } - if (($field =~ /_fr/) or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) { + if ( ($field =~ /_fr/) + or ((defined $product_ref->{lc}) and ($product_ref->{lc} eq 'fr') and ($field !~ /_\w\w$/))) + { $product_ref->{$field} =~ s/^\s*(autre logo)?\s*$//ig; } $product_ref->{$field} =~ s/ +/ /g; $product_ref->{$field} =~ s/,(\s*),/,/g; $product_ref->{$field} =~ s/\.(\.+)$/\./; - + # Don't remove a single dash -, it is used to indicate an existing value should be deleted if ($product_ref->{$field} ne '-') { - + # Remove trailing dashes and commas $product_ref->{$field} =~ s/(\s|-|;|,)*$//; # Remove leading dashes, commas and dots # be careful not to turn -5 to 5: remove dashes only if they are not followed by a number $product_ref->{$field} =~ s/^(\s|-(?![0-9])|;|,|\.)+//; - + # Remove entries made entirely of punctuation characters, or x or X $product_ref->{$field} =~ s/^(x|X|,|;|-|_|\/|\\|#|:|\.|\s)+$//; } @@ -1107,7 +1171,6 @@ sub clean_fields($product_ref) { return; } - sub clean_fields_for_all_products() { foreach my $code (sort keys %products) { @@ -1117,14 +1180,13 @@ sub clean_fields_for_all_products() { return; } - -sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { +sub load_xml_file ($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { # $code can be undef or passed if we already know it from the file name # try to guess the code from the file name if ((not defined $code) and ($file =~ /\D(\d{13})\D/)) { $code = $1; - $log->info("inferring code from file name", { code => $code, file => $file }) if $log->is_info(); + $log->info("inferring code from file name", {code => $code, file => $file}) if $log->is_info(); } @@ -1132,21 +1194,21 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { $code = normalize_code($code); } - $log->info("parsing xml file with XML::Rules", { file => $file, xml_rules => $xml_rules_ref }) if $log->is_info(); + $log->info("parsing xml file with XML::Rules", {file => $file, xml_rules => $xml_rules_ref}) if $log->is_info(); my $parser = XML::Rules->new(rules => $xml_rules_ref); my $xml_ref; - eval { $xml_ref = $parser->parse_file($file); }; + eval {$xml_ref = $parser->parse_file($file);}; if ($@ ne "") { - $log->error("error parsing xml file with XML::Rules", { file => $file, error=>$@ }) if $log->is_error(); + $log->error("error parsing xml file with XML::Rules", {file => $file, error => $@}) if $log->is_error(); push @xml_errors, $file; #exit; } - $log->trace("XML::Rules output", { file => $file, xml_ref => $xml_ref }) if $log->is_trace(); + $log->trace("XML::Rules output", {file => $file, xml_ref => $xml_ref}) if $log->is_trace(); # Skip empty XML files if (not defined $xml_ref) { @@ -1155,36 +1217,36 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { if ($log->is_trace()) { binmode STDOUT, ":encoding(UTF-8)"; - open (my $OUT_JSON, ">", "$www_root/data/import_debug_xml.json"); + open(my $OUT_JSON, ">", "$www_root/data/import_debug_xml.json"); print $OUT_JSON encode_json($xml_ref); - close ($OUT_JSON); + close($OUT_JSON); } # Some producers (e.g. Auchan) have multiple product codes in one file, with multiple label field values, # but without an actual id to make the mapping. -# -#- -# -# -# -# -# -# -# -# ... -#+ -#- -#Crème Dessert Chocolat Caramel Auchan x4 -#Chocolat Caramel x 4 -#Crème dessert Caram'choc -#Crème dessert aromatisée caramel chocolat - -# multiple_codes => { -# codes => codes, # all sub fields will be moved to the root of the split children -# fuzzy_match => "etiquettes", # if exists, specify a field that depends on the child -# fuzzy_from => "DenominationCommerciale", # value from "codes" that will be fuzzy matched to find the id for "fuzzy_match" hash -# }, + # + #- + # + # + # + # + # + # + # + # ... + #+ + #- + #Crème Dessert Chocolat Caramel Auchan x4 + #Chocolat Caramel x 4 + #Crème dessert Caram'choc + #Crème dessert aromatisée caramel chocolat + + # multiple_codes => { + # codes => codes, # all sub fields will be moved to the root of the split children + # fuzzy_match => "etiquettes", # if exists, specify a field that depends on the child + # fuzzy_from => "DenominationCommerciale", # value from "codes" that will be fuzzy matched to find the id for "fuzzy_match" hash + # }, my @xml_refs = (); @@ -1194,7 +1256,7 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { my $array = $xml_fields_mapping_ref->[0][1]; - $log->info("Split multiple products", { file => $file, array => $array }) if $log->is_info(); + $log->info("Split multiple products", {file => $file, array => $array}) if $log->is_info(); if (defined $xml_ref->{$array}) { my $i = 1; @@ -1210,11 +1272,10 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { } - # Multiple variant of one product, with different codes? elsif ($xml_fields_mapping_ref->[0][0] eq "multiple_codes") { - $log->info("Split multiple codes (product variants)", { file => $file }) if $log->is_info(); + $log->info("Split multiple codes (product variants)", {file => $file}) if $log->is_info(); my $codes = $xml_fields_mapping_ref->[0][1]{codes}; @@ -1229,7 +1290,7 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { $fuzzy_match = $xml_fields_mapping_ref->[0][1]{fuzzy_match}; if (defined $xml_ref->{$fuzzy_match}) { @fuzzy_match_keys = sort keys %{$xml_ref->{$fuzzy_match}}; - @fuzzy_match_keysid = map { get_string_id_for_lang("no_language", $_) } @fuzzy_match_keys; + @fuzzy_match_keysid = map {get_string_id_for_lang("no_language", $_)} @fuzzy_match_keys; } } @@ -1238,14 +1299,15 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { $new_code = normalize_code($new_code); - $log->info("Split multiple products - code", { code => $new_code }) if $log->is_info(); + $log->info("Split multiple products - code", {code => $new_code}) if $log->is_info(); my $new_xml_ref = dclone($xml_ref); $new_xml_ref->{code} = $new_code; foreach my $field (sort keys %{$xml_ref->{$codes}{$new_code}}) { - $log->info("Split multiple products - copy field", { code => $new_code, field => $field }) if $log->is_info(); + $log->info("Split multiple products - copy field", {code => $new_code, field => $field}) + if $log->is_info(); $new_xml_ref->{$field} = $xml_ref->{$codes}{$new_code}{$field}; } @@ -1255,17 +1317,19 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { my $fuzzy_from = $xml_fields_mapping_ref->[0][1]{fuzzy_from}; - $log->info("Fuzzy match", { fuzzy_from => $fuzzy_from }) if $log->is_info(); + $log->info("Fuzzy match", {fuzzy_from => $fuzzy_from}) if $log->is_info(); if (defined $new_xml_ref->{$fuzzy_from}) { - my $tf = Text::Fuzzy->new (get_string_id_for_lang("no_language", $new_xml_ref->{$fuzzy_from})); - my $nearestid = $tf->nearest (\@fuzzy_match_keysid); + my $tf = Text::Fuzzy->new(get_string_id_for_lang("no_language", $new_xml_ref->{$fuzzy_from})); + my $nearestid = $tf->nearest(\@fuzzy_match_keysid); my $nearest = $fuzzy_match_keys[$nearestid]; - $log->info("Fuzzy match found", { fuzzy_from => $fuzzy_from, value => $new_xml_ref->{$fuzzy_from}, nearest => $nearest }) if $log->is_info(); + $log->info("Fuzzy match found", + {fuzzy_from => $fuzzy_from, value => $new_xml_ref->{$fuzzy_from}, nearest => $nearest}) + if $log->is_info(); foreach my $field (sort keys %{$xml_ref->{$fuzzy_match}{$nearest}}) { - $log->info("Fuzzy match - copy field", { field => $field }) if $log->is_info(); + $log->info("Fuzzy match - copy field", {field => $field}) if $log->is_info(); $new_xml_ref->{$field} = $xml_ref->{$fuzzy_match}{$nearest}{$field}; } @@ -1285,15 +1349,15 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { push @xml_refs, $xml_ref; } - $log->info("Mapping XML fields", { file => $file }) if $log->is_info(); + $log->info("Mapping XML fields", {file => $file}) if $log->is_info(); -# my @xml_fields_mapping = ( -# -# # get the code first -# -# ["fields.AL_CODE_EAN.FR", "code"], -# ["ProductCode", "producer_version_id"], -# ["fields.AL_INGREDIENT.*", "ingredients_text_*"], + # my @xml_fields_mapping = ( + # + # # get the code first + # + # ["fields.AL_CODE_EAN.FR", "code"], + # ["ProductCode", "producer_version_id"], + # ["fields.AL_INGREDIENT.*", "ingredients_text_*"], # $code = undef; @@ -1301,149 +1365,161 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { foreach my $xml_ref (@xml_refs) { - my $product_ref; + my $product_ref; - if (defined $code) { - $product_ref = get_or_create_product_for_code($code); - } + if (defined $code) { + $product_ref = get_or_create_product_for_code($code); + } - foreach my $field_mapping_ref (@{$xml_fields_mapping_ref}) { - my $source = $field_mapping_ref->[0]; - my $target = $field_mapping_ref->[1]; + foreach my $field_mapping_ref (@{$xml_fields_mapping_ref}) { + my $source = $field_mapping_ref->[0]; + my $target = $field_mapping_ref->[1]; - $log->trace("source $i", { source=>$source, target=>$target }) if $log->is_trace(); + $log->trace("source $i", {source => $source, target => $target}) if $log->is_trace(); - my $current_tag = $xml_ref; + my $current_tag = $xml_ref; - print STDERR "\nsource: $source\n"; + print STDERR "\nsource: $source\n"; - foreach my $source_tag (split(/\./, $source)) { - print STDERR "source_tag: $source_tag\n"; + foreach my $source_tag (split(/\./, $source)) { + print STDERR "source_tag: $source_tag\n"; - # commands + # commands - # ["[delete_except]", "producer|emb_codes|origin"], + # ["[delete_except]", "producer|emb_codes|origin"], - if ($source_tag eq '[delete_except]') { - my $regexp = $target; - foreach my $field ( sort keys %{$product_ref}) { - next if $field eq 'code'; - next if $field =~ /$regexp/i; - $log->trace("deleting existing field", { field=>$field }) if $log->is_trace(); - delete $product_ref->{$field}; + if ($source_tag eq '[delete_except]') { + my $regexp = $target; + foreach my $field (sort keys %{$product_ref}) { + next if $field eq 'code'; + next if $field =~ /$regexp/i; + $log->trace("deleting existing field", {field => $field}) if $log->is_trace(); + delete $product_ref->{$field}; + } } - } - # multiple values in different languages + # multiple values in different languages - elsif ($source_tag eq '*') { - foreach my $tag ( sort keys %{$current_tag}) { - my $tag_target = $target; + elsif ($source_tag eq '*') { + foreach my $tag (sort keys %{$current_tag}) { + my $tag_target = $target; - # special case where we have something like allergens.nuts = traces - if ($tag_target eq "value_as_target_and_source_as_value") { - print STDERR "* tag key: $tag - target: $tag_target\n"; - if ((defined $current_tag->{$tag}) and (not ref($current_tag->{$tag})) and ($current_tag->{$tag} ne '')) { - print STDERR "assign $tag to $current_tag->{$tag}\n"; + # special case where we have something like allergens.nuts = traces + if ($tag_target eq "value_as_target_and_source_as_value") { + print STDERR "* tag key: $tag - target: $tag_target\n"; + if ( (defined $current_tag->{$tag}) + and (not ref($current_tag->{$tag})) + and ($current_tag->{$tag} ne '')) + { + print STDERR "assign $tag to $current_tag->{$tag}\n"; - assign_value($product_ref, $current_tag->{$tag}, $tag); + assign_value($product_ref, $current_tag->{$tag}, $tag); + } } - } - else { - - $tag_target =~ s/\*/$tag/; - $tag_target = lc($tag_target); - print STDERR "* tag key: $tag - target: $tag_target\n"; - if ((defined $current_tag->{$tag}) and (not ref($current_tag->{$tag})) and ($current_tag->{$tag} ne '')) { - print STDERR "$tag value is a scalar: $current_tag->{$tag}, assign value to $tag_target\n"; - if ($tag_target eq 'code') { - $code = $current_tag->{$tag}; + else { - $code = normalize_code($code); - $product_ref = get_or_create_product_for_code($code); - } - assign_value($product_ref, $tag_target, $current_tag->{$tag}); + $tag_target =~ s/\*/$tag/; + $tag_target = lc($tag_target); + print STDERR "* tag key: $tag - target: $tag_target\n"; + if ( (defined $current_tag->{$tag}) + and (not ref($current_tag->{$tag})) + and ($current_tag->{$tag} ne '')) + { + print STDERR + "$tag value is a scalar: $current_tag->{$tag}, assign value to $tag_target\n"; + if ($tag_target eq 'code') { + $code = $current_tag->{$tag}; + + $code = normalize_code($code); + $product_ref = get_or_create_product_for_code($code); + } + assign_value($product_ref, $tag_target, $current_tag->{$tag}); - if ($tag_target eq 'emb_codes') { - print STDERR "emb_codes : " . $product_ref->{$tag_target} . "\n"; + if ($tag_target eq 'emb_codes') { + print STDERR "emb_codes : " . $product_ref->{$tag_target} . "\n"; + } } } } + last; } - last; - } - # Array - e.g. ["nutrients.ENERKJ.[0].RoundValue", "nutriments.energy_kJ"], + # Array - e.g. ["nutrients.ENERKJ.[0].RoundValue", "nutriments.energy_kJ"], - elsif ($source_tag =~ /^\[(\d+)\]$/) { - my $i = $1; - if ((ref($current_tag) eq 'ARRAY') and (defined $current_tag->[$i])) { - print STDERR "going down to array element $source_tag - $i\n"; - $current_tag = $current_tag->[$i]; + elsif ($source_tag =~ /^\[(\d+)\]$/) { + my $i = $1; + if ((ref($current_tag) eq 'ARRAY') and (defined $current_tag->[$i])) { + print STDERR "going down to array element $source_tag - $i\n"; + $current_tag = $current_tag->[$i]; + } } - } - # Array with several versions identified by a number, take the highest one - # {$version}) and (not defined $max) or ($version_ref->{$version} > $max)) { - $max = $version_ref->{$version}; - $max_version_ref = $version_ref; + # Array with several versions identified by a number, take the highest one + # {$version}) and (not defined $max) + or ($version_ref->{$version} > $max)) + { + $max = $version_ref->{$version}; + $max_version_ref = $version_ref; + } + } + if (defined $max_version_ref) { + print STDERR "going down to array element $source_tag - version $max\n"; + $current_tag = $max_version_ref; } - } - if (defined $max_version_ref) { - print STDERR "going down to array element $source_tag - version $max\n"; - $current_tag = $max_version_ref; } } - } - elsif (defined $current_tag->{$source_tag}) { - if ((ref($current_tag->{$source_tag}) eq 'HASH') or (ref($current_tag->{$source_tag}) eq 'ARRAY')) { - print STDERR "going down to hash $source_tag\n"; - $current_tag = $current_tag->{$source_tag}; - } - elsif ((defined $current_tag->{$source_tag}) and (not ref($current_tag->{$source_tag})) and ($current_tag->{$source_tag} ne '')) { + elsif (defined $current_tag->{$source_tag}) { + if ((ref($current_tag->{$source_tag}) eq 'HASH') or (ref($current_tag->{$source_tag}) eq 'ARRAY')) { + print STDERR "going down to hash $source_tag\n"; + $current_tag = $current_tag->{$source_tag}; + } + elsif ( (defined $current_tag->{$source_tag}) + and (not ref($current_tag->{$source_tag})) + and ($current_tag->{$source_tag} ne '')) + { - my $value = $current_tag->{$source_tag}; + my $value = $current_tag->{$source_tag}; - print STDERR "$source_tag is a scalar: $value, assign value to $target\n"; - if ($target eq 'code') { - $code = $value; - $code = normalize_code($code); - $product_ref = get_or_create_product_for_code($code); - } + print STDERR "$source_tag is a scalar: $value, assign value to $target\n"; + if ($target eq 'code') { + $code = $value; + $code = normalize_code($code); + $product_ref = get_or_create_product_for_code($code); + } - my $seen_energy_kj = 0; + my $seen_energy_kj = 0; - if ($target =~ /^nutriments.(.*)/) { - $target = $1; + if ($target =~ /^nutriments.(.*)/) { + $target = $1; - # skip energy in kcal if we already have energy in kJ - if (($seen_energy_kj) and ($target =~ /kcal/i)) { - next; - } + # skip energy in kcal if we already have energy in kJ + if (($seen_energy_kj) and ($target =~ /kcal/i)) { + next; + } - if ($target =~ /kj/i) { - $seen_energy_kj = 1; - } + if ($target =~ /kj/i) { + $seen_energy_kj = 1; + } - $value =~ s/,/\./; + $value =~ s/,/\./; - if ($target =~ /^(.*)_value$/) { - assign_value($product_ref, $target, $value); - } - elsif ($target =~ /^(.*)_unit$/) { - assign_value($product_ref, $target, $value); - } - elsif ($target =~ /^(.*)_([^_]+)$/) { + if ($target =~ /^(.*)_value$/) { + assign_value($product_ref, $target, $value); + } + elsif ($target =~ /^(.*)_unit$/) { + assign_value($product_ref, $target, $value); + } + elsif ($target =~ /^(.*)_([^_]+)$/) { $target = $1; my $unit = $2; assign_value($product_ref, $target . "_value", $value); @@ -1453,31 +1529,30 @@ sub load_xml_file($file, $xml_rules_ref, $xml_fields_mapping_ref, $code) { else { assign_value($product_ref, $target . "_unit", ""); } + } + else { + assign_value($product_ref, $target . "_value", $value); + } } else { - assign_value($product_ref, $target . "_value", $value); + assign_value($product_ref, $target, $value); } } - else { - assign_value($product_ref, $target, $value); - } + } + else { + last; } } - else { - last; - } - } - $i++; - } + $i++; + } - } #foreach @xml_refs + } #foreach @xml_refs return 0; } - -sub load_csv_file($options_ref) { +sub load_csv_file ($options_ref) { my $file = $options_ref->{file}; my $encoding = $options_ref->{encoding}; @@ -1490,37 +1565,37 @@ sub load_csv_file($options_ref) { # e.g. load_csv_file($file, "UTF-8", "\t", 4); - $log->info("Loading CSV file", { file => $file }) if $log->is_info(); + $log->info("Loading CSV file", {file => $file}) if $log->is_info(); - my $csv_options_ref = { binary => 1 , sep_char => $separator }; + my $csv_options_ref = {binary => 1, sep_char => $separator}; if (defined $options_ref->{escape_char}) { $csv_options_ref->{escape_char} = $options_ref->{escape_char}; } - my $csv = Text::CSV->new ( $csv_options_ref ) # should set binary attribute. - or die "Cannot use CSV: " . Text::CSV->error_diag (); + my $csv = Text::CSV->new($csv_options_ref) # should set binary attribute. + or die "Cannot use CSV: " . Text::CSV->error_diag(); - open (my $io, "<:encoding($encoding)", $file) or die("Could not open $file: $!"); + open(my $io, "<:encoding($encoding)", $file) or die("Could not open $file: $!"); my $i = 0; # line number if (defined $skip_lines) { $log->info("Skipping $skip_lines lines before header") if $log->is_info(); for ($i = 0; $i < $skip_lines; $i++) { - $csv->getline ($io); + $csv->getline($io); } } #my $headers_ref = $csv->getline ($io); $i++; - $csv->header ($io, { detect_bom => 1 }); + $csv->header($io, {detect_bom => 1}); if (defined $skip_lines_after_header) { $log->info("Skipping $skip_lines_after_header lines after header") if $log->is_info(); for (my $j = 0; $j < $skip_lines_after_header; $j++) { - $csv->getline ($io); + $csv->getline($io); $i++; } } @@ -1531,9 +1606,9 @@ sub load_csv_file($options_ref) { my $product_ref; - while (my $csv_product_ref = $csv->getline_hr ($io)) { + while (my $csv_product_ref = $csv->getline_hr($io)) { - $i++; # line number + $i++; # line number $log->info("Reading line $i") if $log->is_info(); @@ -1559,8 +1634,9 @@ sub load_csv_file($options_ref) { my $source_condition_field = $field_mapping_ref->[$condition][0]; my $source_condition_value = $field_mapping_ref->[$condition][1]; - if ((not defined $csv_product_ref->{$source_condition_field}) - or ($csv_product_ref->{$source_condition_field} ne $source_condition_value)) { + if ( (not defined $csv_product_ref->{$source_condition_field}) + or ($csv_product_ref->{$source_condition_field} ne $source_condition_value)) + { $match = 0; } @@ -1583,7 +1659,10 @@ sub load_csv_file($options_ref) { print STDERR "skipping invalid code\n"; last; } - elsif ((defined $skip_non_existing_products) and ($skip_non_existing_products) and (not exists $products{$code})) { + elsif ( (defined $skip_non_existing_products) + and ($skip_non_existing_products) + and (not exists $products{$code})) + { print STDERR "skipping non existing product\n"; last; } @@ -1605,7 +1684,7 @@ sub load_csv_file($options_ref) { my $dir = $'; $dir =~ s/\/$//; - my $file = $csv_product_ref->{$source_field}; + my $file = $csv_product_ref->{$source_field}; $file =~ s/.*\///; $file =~ s/[^A-Za-z0-9-_\.]/_/g; @@ -1616,7 +1695,7 @@ sub load_csv_file($options_ref) { # do not download again images that we already have # but try again if the size is 0 - if ((! -e "$dir/$file") or ((-s "$dir/$file") < 10000)) { + if ((!-e "$dir/$file") or ((-s "$dir/$file") < 10000)) { print STDERR "downloading image: wget $csv_product_ref->{$source_field} -O $dir/$file\n"; system("wget \"" . $csv_product_ref->{$source_field} . "\" -O $dir/$file"); @@ -1639,15 +1718,15 @@ sub load_csv_file($options_ref) { } if ($target_field =~ /^(.*)_([^_]+)$/) { - $target_field = $1; - my $unit = $2; - assign_value($product_ref, $target_field . "_value", $value); - if ($value ne "") { - assign_value($product_ref, $target_field . "_unit", $unit); - } - else { - assign_value($product_ref, $target_field . "_unit", ""); - } + $target_field = $1; + my $unit = $2; + assign_value($product_ref, $target_field . "_value", $value); + if ($value ne "") { + assign_value($product_ref, $target_field . "_unit", $unit); + } + else { + assign_value($product_ref, $target_field . "_unit", ""); + } } else { assign_value($product_ref, $target_field . "_value", $value); @@ -1668,7 +1747,9 @@ sub load_csv_file($options_ref) { } } else { - $log->error("undefined source field", { line => $i, source_field=>$source_field, csv_product_ref=>$csv_product_ref }) if $log->is_error(); + $log->error("undefined source field", + {line => $i, source_field => $source_field, csv_product_ref => $csv_product_ref}) + if $log->is_error(); die; } } @@ -1678,7 +1759,7 @@ sub load_csv_file($options_ref) { return; } -sub recursive_list($list_ref, $arg) { +sub recursive_list ($list_ref, $arg) { if (-d $arg) { @@ -1686,8 +1767,8 @@ sub recursive_list($list_ref, $arg) { print STDERR "Opening dir $dir\n"; - if (opendir (DH, "$dir")) { - foreach my $file (sort { $a cmp $b } readdir(DH)) { + if (opendir(DH, "$dir")) { + foreach my $file (sort {$a cmp $b} readdir(DH)) { next if (($file eq '.') or ($file eq '..')); @@ -1695,7 +1776,7 @@ sub recursive_list($list_ref, $arg) { } } - closedir (DH); + closedir(DH); } else { push @{$list_ref}, $arg; @@ -1704,7 +1785,7 @@ sub recursive_list($list_ref, $arg) { return; } -sub get_list_of_files(@files_and_dirs) { +sub get_list_of_files (@files_and_dirs) { # Read the list of files or directories passed as parameters @@ -1720,14 +1801,13 @@ sub get_list_of_files(@files_and_dirs) { return @files; } +sub print_csv_file ($file_handle) { + my $csv_out + = Text::CSV->new({binary => 1, sep_char => "\t", eol => "\n", quote_space => 0}) # should set binary attribute. + or die "Cannot use CSV: " . Text::CSV->error_diag(); -sub print_csv_file($file_handle) { - - my $csv_out = Text::CSV->new ( { binary => 1 , sep_char => "\t", eol => "\n", quote_space => 0 } ) # should set binary attribute. - or die "Cannot use CSV: ".Text::CSV->error_diag (); - - $csv_out->print ($file_handle, \@fields) ; + $csv_out->print($file_handle, \@fields); foreach my $code (sort keys %products) { @@ -1743,7 +1823,7 @@ sub print_csv_file($file_handle) { } } - $csv_out->print ($file_handle, \@values) ; + $csv_out->print($file_handle, \@values); print STDERR "code: $code\n"; } @@ -1751,7 +1831,6 @@ sub print_csv_file($file_handle) { return; } - sub print_stats() { my %existing_values = (); @@ -1780,7 +1859,6 @@ sub print_stats() { return; } - =head2 extract_nutrition_facts_from_text ( LC, TEXT, NUTRIENTS_REF ) C extract nutrition facts from a text @@ -1812,7 +1890,7 @@ Reference to a scalar that will be set to the serving size if the nutrition fact =cut -sub extract_nutrition_facts_from_text($text_lc, $text, $nutrients_ref, $nutrition_data_per_ref, $serving_size_ref) { +sub extract_nutrition_facts_from_text ($text_lc, $text, $nutrients_ref, $nutrition_data_per_ref, $serving_size_ref) { if ((defined $text) and ($text ne "")) { @@ -1825,7 +1903,10 @@ sub extract_nutrition_facts_from_text($text_lc, $text, $nutrients_ref, $nutritio } } elsif ($text_lc eq "fr") { - if ($text =~ /^\s*(à la |a la |pour |par |)(1 |une )?portion (de |d'environ )?\(? ?(\d+((\.|,)\d+)? ?(g|kg|mg|µg|l|dl|cl|ml))/i) { + if ($text + =~ /^\s*(à la |a la |pour |par |)(1 |une )?portion (de |d'environ )?\(? ?(\d+((\.|,)\d+)? ?(g|kg|mg|µg|l|dl|cl|ml))/i + ) + { ${$nutrition_data_per_ref} = "serving"; ${$serving_size_ref} = $4; } @@ -1866,7 +1947,9 @@ sub extract_nutrition_facts_from_text($text_lc, $text, $nutrients_ref, $nutritio # Vitamine D µg 0.4 soit 8 % des AQR* - if ($text =~ /\b$synonym\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)/i) { + if ($text + =~ /\b$synonym\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)/i) + { $unit = $1; $value = $5; if ((defined $3) and ($3 ne "")) { @@ -1894,7 +1977,9 @@ sub extract_nutrition_facts_from_text($text_lc, $text, $nutrients_ref, $nutritio $value = 0; last; } - elsif ($text =~ /\b$synonym(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?/i) { + elsif ($text + =~ /\b$synonym(\s|:)*(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)\b\)?/i) + { $value = $4; $unit = $7; if ((defined $2) and ($2 ne "")) { @@ -1909,7 +1994,9 @@ sub extract_nutrition_facts_from_text($text_lc, $text, $nutrients_ref, $nutritio last; } # missing unit... assume g ? - elsif ($text =~ /\b$synonym(\s|:)+(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)?\)?\b/i) { + elsif ($text + =~ /\b$synonym(\s|:)+(<|~)?(\s)*(\d+((\.|\,)\d+)?)\s*\(?(g|kg|mg|µg|l|dl|cl|ml|kj|kcal)?\)?\b/i) + { $value = $4; $unit = "g"; if ((defined $2) and ($2 ne "")) { @@ -1958,7 +2045,5 @@ sub extract_nutrition_facts_from_text($text_lc, $text, $nutrients_ref, $nutritio return; } - - 1; From 7c2e77aec720f3d28f7acea11bf487fb3f08c67b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Fri, 3 Feb 2023 17:28:27 +0100 Subject: [PATCH 6/8] remove unused scripts --- scripts/bayard-import/README.md | 32 ------------------------ scripts/bayard-import/bayard-xml2json.js | 32 ------------------------ scripts/bayard-import/find_and_copy.sh | 1 - 3 files changed, 65 deletions(-) delete mode 100644 scripts/bayard-import/README.md delete mode 100644 scripts/bayard-import/bayard-xml2json.js delete mode 100644 scripts/bayard-import/find_and_copy.sh diff --git a/scripts/bayard-import/README.md b/scripts/bayard-import/README.md deleted file mode 100644 index 0b46333f53dc0..0000000000000 --- a/scripts/bayard-import/README.md +++ /dev/null @@ -1,32 +0,0 @@ - -# Requirements -- node (https://nodejs.org/) -- miller >=5.0 (https://johnkerl.org/miller/doc/) - -On stretch: -apt-get -t stretch-backports install miller - -# Usage -Install the xml2csv node module found here: https://github.com/odtvince/xml2csv - -As the off user: - -``` -mkdir ~/npm-global -export NPM_CONFIG_PREFIX=~/.npm-global -/srv/off-pro/scripts/equadis-import# git clone https://github.com/odtvince/xml2csv.git -/srv/off-pro/scripts/equadis-import# cd xml2csv/ -/srv/off-pro/scripts/equadis-import/xml2csv# npm link -/srv/off-pro/scripts/equadis-import/xml2csv# cd .. -/srv/off-pro/scripts/equadis-import# npm link xml2csv -``` - -Put all the xml files to import into the `equadis-data` directory - -Then execute: -``` -node equadis-xml2csv.js -./equadis2off.sh > equadis-data.tsv -./dereference.sh equadis-data.tsv -``` - diff --git a/scripts/bayard-import/bayard-xml2json.js b/scripts/bayard-import/bayard-xml2json.js deleted file mode 100644 index b381ae4275804..0000000000000 --- a/scripts/bayard-import/bayard-xml2json.js +++ /dev/null @@ -1,32 +0,0 @@ -// This script is used to convert GDSN data from Equadis in XML format -// to a corresponding JSON structure - -const xml2json = require('xml2json') -const fs = require("fs") - -const directoryPath = "/srv2/off-pro/bayard-data-tmp/" - -const filter = /\.xml$/ - -// force arrays for some fields even if there is only one value supplied -const options = { - arrayNotation: ['nutrientHeader', 'allergen', 'packagingMarkedLabelAccreditationCode'] -}; - -fs.readdir(directoryPath, function(err, files) { - if (err) { - console.log("Error getting directory information.") - } else { - files.forEach(function(file) { - - if (filter.test(file)) { - - let content = fs.readFileSync(directoryPath+file, 'utf8'); - let json = xml2json.toJson(content, options); - fs.writeFileSync(directoryPath+file.replace('.xml','.json'), json); - } - - }) - } -}) - diff --git a/scripts/bayard-import/find_and_copy.sh b/scripts/bayard-import/find_and_copy.sh deleted file mode 100644 index 975639f195bbd..0000000000000 --- a/scripts/bayard-import/find_and_copy.sh +++ /dev/null @@ -1 +0,0 @@ -find /home/sftp/equadis/data/ -mtime -5 -type f -exec grep -q 'NATURENVIE' {} \; -exec cp {} /srv2/off-pro/equadis-data-tmp/ \; From ffdb5c4376b25b61575750da756fef9bae6d0859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Tue, 7 Feb 2023 11:34:51 +0100 Subject: [PATCH 7/8] update tests --- .../import_gs1/agena3000_andros.off.json | 2 +- .../expected_test_results/import_gs1/agena3000_aoste.off.json | 2 +- .../import_gs1/agena3000_lactalis.off.json | 2 +- .../equadis_brasseries_kronenbourg_grimbergen_rouge.off.json | 4 ++-- .../equadis_brasseries_kronenbourg_tourtel.off.json | 2 +- .../import_gs1/equadis_knorr_child_item.off.json | 2 +- .../import_gs1/equadis_knorr_pesto_rouge.off.json | 2 +- .../import_gs1/equadis_nestle_france_ricore.off.json | 2 +- .../import_gs1/equadis_panzani_container.off.json | 2 +- .../import_gs1/equadis_pepsico_chips.off.json | 2 +- ...equadis_unilever_magnum_100g_plus_100ml_nutrition.off.json | 2 +- .../import_gs1/equadis_unilever_magnum_mini_ruby.off.json | 2 +- .../import_gs1/equadis_unilever_moutarde_maille.off.json | 2 +- .../import_gs1/equadis_unilever_obf_signal.off.json | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/unit/expected_test_results/import_gs1/agena3000_andros.off.json b/tests/unit/expected_test_results/import_gs1/agena3000_andros.off.json index d2ae113857413..5cb5d43345efe 100644 --- a/tests/unit/expected_test_results/import_gs1/agena3000_andros.off.json +++ b/tests/unit/expected_test_results/import_gs1/agena3000_andros.off.json @@ -28,7 +28,7 @@ "net_weight" : "0.36 kg", "nutriscore_grade_producer" : "D", "org_name" : "ANDROS", - "packaging" : "Bocal", + "packaging" : "en:jar", "preparation_en" : "Keep away from heat and light. Don't put in the fridge.", "preparation_fr" : "A conserver à l'abri de la chaleur et de la lumière. Ne pas mettre au réfrigérateur", "producer_version_id" : "PREMIERE VERSION", diff --git a/tests/unit/expected_test_results/import_gs1/agena3000_aoste.off.json b/tests/unit/expected_test_results/import_gs1/agena3000_aoste.off.json index 080e3f0bd649b..1b8592667aef5 100644 --- a/tests/unit/expected_test_results/import_gs1/agena3000_aoste.off.json +++ b/tests/unit/expected_test_results/import_gs1/agena3000_aoste.off.json @@ -25,7 +25,7 @@ "net_weight" : "0.17 kg", "org_name" : "AOSTE SNC", "origin_fr" : "Origine du produit fini « Saucisson sec qualité supérieure 47,1% et Emmental - fromage à pâte pressée cuite 52,9%. » : Belgique # Origine de l’ingrédient primaire « Viande de porc » : France # Origine de l’ingrédient primaire « Lait de vache pasteurisé » : France", - "packaging" : "Sac de transport", + "packaging" : "en:carrying-bag", "producer_version_id" : "Création", "product_name_fr" : "JBRI DUO SAUC EMMENTAL 170G", "proteins_100g_unit" : "g", diff --git a/tests/unit/expected_test_results/import_gs1/agena3000_lactalis.off.json b/tests/unit/expected_test_results/import_gs1/agena3000_lactalis.off.json index f31533186d324..6786d13f33c0f 100644 --- a/tests/unit/expected_test_results/import_gs1/agena3000_lactalis.off.json +++ b/tests/unit/expected_test_results/import_gs1/agena3000_lactalis.off.json @@ -37,7 +37,7 @@ "lc" : "fr", "net_weight" : "0.25 kg", "nutriscore_grade_producer" : "EXEMPT", - "packaging" : "Boite", + "packaging" : "en:box", "producer_version_id" : "Création", "product_name_fr" : "PRESIDENT CAMEMBERT 250g", "proteins_100g_unit" : "g", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_grimbergen_rouge.off.json b/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_grimbergen_rouge.off.json index 016af4a362861..768d65805fca2 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_grimbergen_rouge.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_grimbergen_rouge.off.json @@ -25,7 +25,7 @@ "net_weight" : "0.25 kg", "nutriscore_grade_producer" : "EXEMPT", "org_name" : "BRASSERIES KRONENBOURG", - "packaging" : "Bouteille", + "packaging" : "en:bottle", "product_name_fr" : "GRIMBERGEN - 25CL GRIMBERGEN ROUGE - 5.50 DEGRE ALCOOL", "proteins_100g_unit" : "g", "proteins_100g_value" : "0.4", @@ -73,7 +73,7 @@ "net_weight" : "1.5 kg", "nutriscore_grade_producer" : "EXEMPT", "org_name" : "BRASSERIES KRONENBOURG", - "packaging" : "Bouteille", + "packaging" : "en:bottle", "product_name_fr" : "GRIMBERGEN - 6X25CL GRIMBERGEN ROUGE - 5.50 DEGRE ALCOOL", "proteins_100g_unit" : "g", "proteins_100g_value" : "0.4", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_tourtel.off.json b/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_tourtel.off.json index 8f302f2f9fe4b..a413489f303fe 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_tourtel.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_brasseries_kronenbourg_tourtel.off.json @@ -27,7 +27,7 @@ "nutriscore_grade_producer" : "D", "org_name" : "BRASSERIES KRONENBOURG", "origin_fr" : "67210 Obernai France", - "packaging" : "Bouteille", + "packaging" : "en:bottle", "product_name_fr" : "TOURTEL - 27,5CL TTCITR BIO FR-BIO-01 - 0.00 DEGRE ALCOOL", "proteins_100g_unit" : "g", "proteins_100g_value" : "0.2", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_knorr_child_item.off.json b/tests/unit/expected_test_results/import_gs1/equadis_knorr_child_item.off.json index 4fad8e65a5403..11682d956b852 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_knorr_child_item.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_knorr_child_item.off.json @@ -37,7 +37,7 @@ "lc" : "fr", "net_weight" : "0.0690 kg", "org_name" : "UNILEVER FRANCE GMS", - "packaging" : "Poche", + "packaging" : "en:bag", "preparation_fr" : " Pour préparer 2 bols généreux1. Faites bouillir 700 ml d'eau. Ramenez à feu doux puis versez le contenu du sachet en remuant avec une cuillère en bois. 2. Laissez mijoter 5 minutes en remuant de temps en temps. Inutile de saler.", "producer_version_id" : "SERVING_SIZE_CU KN DSO SICHUAN 69G POU FR", "product_name_fr" : "Knorr Soupe Déshydratée Chinoise Nouilles et Champignons Noirs Sachet 69g 2 Portions", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_knorr_pesto_rouge.off.json b/tests/unit/expected_test_results/import_gs1/equadis_knorr_pesto_rouge.off.json index a32c4d6dcb3f0..0648ba5ba8e08 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_knorr_pesto_rouge.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_knorr_pesto_rouge.off.json @@ -14,7 +14,7 @@ "lc" : "fr", "net_weight" : "0.34 kg", "org_name" : "UNILEVER FRANCE RHD", - "packaging" : "Film", + "packaging" : "en:film", "preparation_fr" : "Préparation :\nMise en Place s'utilise en mélange à chaud comme à froid, pour des applications multiples.\nMise en place contenant déjà du sel, saler la préparation en dernier lieu si nécessaire.\nPour préparer une sauce au pesto rouge, ajouter de l'huile d'olive et du parmesan.\nSuggestions de recettes :\nConvient parfaitement pour l'assaisonnement de pâtes, sauces tomate, poissons, viandes, marinades, carpaccios, vinaigrettes et sauces pour\nPanini.", "producer_version_id" : "Knorr ROUGE Pâte aux Aromates 340 GR", "product_name_fr" : "Knorr Mise en place pesto rouge Pot 340g", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_nestle_france_ricore.off.json b/tests/unit/expected_test_results/import_gs1/equadis_nestle_france_ricore.off.json index 6fa418de539e5..4dff70ed84712 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_nestle_france_ricore.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_nestle_france_ricore.off.json @@ -41,7 +41,7 @@ "nutriscore_grade_producer" : "D", "org_name" : "NESTLE FRANCE (DIV CHOC,CUL,BI,INF)", "origin_fr" : "Fabriqué en France. Café Origine non-UE. Chicorée Origine UE et Non-UE. Lait origine France. La majorité de notre chicorée vient du Nord de la France.", - "packaging" : "Canette", + "packaging" : "en:can", "preparation_fr" : "4 cuillères + 200 ml d'eau frémissante.", "producer_version_id" : "44041392", "product_name_fr" : "RICORE au Lait, Boîte de 400g", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_panzani_container.off.json b/tests/unit/expected_test_results/import_gs1/equadis_panzani_container.off.json index bb02d727c2a3b..f9df6ef555d5b 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_panzani_container.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_panzani_container.off.json @@ -26,7 +26,7 @@ "monounsaturated-fat_100g_value" : "0", "net_weight" : "3.2 kg", "org_name" : "PANZANI SA", - "packaging" : "Conteneur", + "packaging" : "en:container", "preparation_fr" : "Plongez les pâtes dans de l'eau bouillante salée à votre convenance (1L pour 100g de pâtes) et laissez cuire le temps indiqué sur le devant du paquet.", "product_name_fr" : "TAGLIATELLE QUALITE PATES FRAICHES", "proteins_100g_unit" : "g", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_pepsico_chips.off.json b/tests/unit/expected_test_results/import_gs1/equadis_pepsico_chips.off.json index 8bdbf47bd30c6..110dfba624a39 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_pepsico_chips.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_pepsico_chips.off.json @@ -37,7 +37,7 @@ "net_weight" : "0.264 kg", "nutriscore_grade_producer" : "C", "org_name" : "PEPSICO FRANCE", - "packaging" : "Sac", + "packaging" : "en:bag", "preparation_fr" : "A consommer sans préparation", "product_name_fr" : "Lay's saveur sel & vinaigre 240 g + 10% offert", "proteins_100g_unit" : "g", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_100g_plus_100ml_nutrition.off.json b/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_100g_plus_100ml_nutrition.off.json index 413c5fbc98a2d..5f1db043bc871 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_100g_plus_100ml_nutrition.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_100g_plus_100ml_nutrition.off.json @@ -34,7 +34,7 @@ "net_weight" : "0.3 kg", "nutriscore_grade_producer" : "D", "org_name" : "UNILEVER FRANCE GMS", - "packaging" : "Film", + "packaging" : "en:film", "preparation_fr" : "Sortez vos glaces du congélateur quelques minutes avant de les déguster, afin de profiter pleinement de leurs saveurs.", "producer_version_id" : "Magnum DOUBLE RASPEBBRY Bâtonnets de Glace 360 ML", "product_name_fr" : "Magnum Glace Batonnet Mini Double Framboise 6x60ml", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_mini_ruby.off.json b/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_mini_ruby.off.json index 82d2afac480f0..e774c6365f4f6 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_mini_ruby.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_unilever_magnum_mini_ruby.off.json @@ -34,7 +34,7 @@ "net_weight" : "0.258 kg", "nutriscore_grade_producer" : "D", "org_name" : "UNILEVER FRANCE GMS", - "packaging" : "Film", + "packaging" : "en:film", "preparation_fr" : "Pour une dégustation encore plus onctueuse de votre crème glacée, sortez-le quelques minutes avant de servir.", "producer_version_id" : "Magnum RUBY Bâtonnets de Glace 330 ML", "product_name_fr" : "Magnum Glace Bâtonnet Mini Ruby x6 330ml", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_unilever_moutarde_maille.off.json b/tests/unit/expected_test_results/import_gs1/equadis_unilever_moutarde_maille.off.json index 5264dc528227c..010067ff11590 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_unilever_moutarde_maille.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_unilever_moutarde_maille.off.json @@ -32,7 +32,7 @@ "lc" : "fr", "net_weight" : "0.34 kg", "org_name" : "UNILEVER FRANCE GMS", - "packaging" : "Film", + "packaging" : "en:film", "periods_after_opening" : "1 month", "preparation_fr" : ".", "producer_version_id" : "MAILLE FIN GOURMET Moutarde 375 ML", diff --git a/tests/unit/expected_test_results/import_gs1/equadis_unilever_obf_signal.off.json b/tests/unit/expected_test_results/import_gs1/equadis_unilever_obf_signal.off.json index 3019df55f6f83..0b07500c65412 100644 --- a/tests/unit/expected_test_results/import_gs1/equadis_unilever_obf_signal.off.json +++ b/tests/unit/expected_test_results/import_gs1/equadis_unilever_obf_signal.off.json @@ -15,7 +15,7 @@ "lc" : "fr", "net_weight" : "0.095 kg", "org_name" : "UNILEVER FRANCE GMS", - "packaging" : "Tube", + "packaging" : "en:tube", "periods_after_opening" : "12 month", "preparation_fr" : "Brossez-vous les dents en effectuant de petits mouvements de haut en bas – du rose des gencives vers le blanc des dents - et en prenant soin de passer sur toute la surface de chaque dent. Prenez votre temps : il faut compter au moins deux minutes, voire trois, pour un brossage efficace. Terminez votre rituel de soins dentaire par un bain de bouche Signal pour une hygiène optimale.", "producer_version_id" : "Signal CLAY&CHARCOAL DENTIFRICE 75 ML", From e7acb51cc4bb9eb0666152bf91603a936ba9404b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Wed, 8 Feb 2023 10:28:26 +0100 Subject: [PATCH 8/8] small fix --- lib/ProductOpener/ImportConvert.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ProductOpener/ImportConvert.pm b/lib/ProductOpener/ImportConvert.pm index edb3c0acfacb9..1a0059a0b34b4 100644 --- a/lib/ProductOpener/ImportConvert.pm +++ b/lib/ProductOpener/ImportConvert.pm @@ -852,7 +852,7 @@ sub clean_fields ($product_ref) { $brand =~ s/^\s+//; $brand =~ s/\s+$//; # we may get brands with quantifiers like * + ? etc. we need to escape them - $brand =~ s/(\*|\+|\?|\(|\)|\[|\]|\{|\}|\$|\^|\\)/\\/g; + $brand =~ s/(\*|\+|\?|\(|\)|\[|\]|\{|\}|\$|\^|\\)/\\$1/g; # dashes/dots/spaces -> allow matching dashes/dot/spaces # e.g. "bons.mayennais" matches "bons mayennais"