From 380579508dcaaf7c52173dc4f7e035f5cc5ce9ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Fri, 31 Mar 2023 16:36:24 +0200 Subject: [PATCH 1/7] feat: separate products_obsolete MongoDB collection for obsolete products #8078 --- cgi/product_multilingual.pl | 4 + lib/ProductOpener/Config2_sample.pm | 2 +- lib/ProductOpener/Data.pm | 50 ++++++++-- lib/ProductOpener/Display.pm | 46 +++++---- lib/ProductOpener/Products.pm | 53 +++++++++-- scripts/export_database.pl | 2 +- scripts/export_products_data_and_images.pl | 3 +- scripts/fix_non_normalized_codes.pl | 6 +- scripts/gen_packaging_stats.pl | 2 +- scripts/gen_top_tags_per_country.pl | 4 +- ...roducts_to_products_obsolete_collection.pl | 94 +++++++++++++++++++ scripts/update_all_products.pl | 2 +- 12 files changed, 226 insertions(+), 42 deletions(-) create mode 100644 scripts/move_obsolete_products_to_products_obsolete_collection.pl diff --git a/cgi/product_multilingual.pl b/cgi/product_multilingual.pl index 7ee9f7d124efd..3877846eb5b66 100755 --- a/cgi/product_multilingual.pl +++ b/cgi/product_multilingual.pl @@ -614,7 +614,11 @@ ($product_ref) # Obsolete products + # We test if the "obsolete_since_date" field is present, as the checkbox field won't be sent if the box is unchecked if (($User{moderator} or $Owner_id) and (defined single_param('obsolete_since_date'))) { + # We need to temporarily record if the product was obsolete, so that we can remove it + # from the product or product_obsolete collection if its obsolete status changed + $product_ref->{was_obsolete} = $product_ref->{obsolete}; $product_ref->{obsolete} = remove_tags_and_quote(decode utf8 => single_param("obsolete")); $product_ref->{obsolete_since_date} = remove_tags_and_quote(decode utf8 => single_param("obsolete_since_date")); } diff --git a/lib/ProductOpener/Config2_sample.pm b/lib/ProductOpener/Config2_sample.pm index c086f4af49c52..8dc64184c2034 100644 --- a/lib/ProductOpener/Config2_sample.pm +++ 
b/lib/ProductOpener/Config2_sample.pm @@ -63,7 +63,7 @@ $data_root = "/home/off"; $geolite2_path = '/usr/local/share/GeoLite2-Country/GeoLite2-Country.mmdb'; -$mongodb = "off"; +$mongodb = "off"; # MongoDB database name $mongodb_host = "mongodb://localhost"; $mongodb_timeout_ms = 50000; # config option max_time_ms/maxTimeMS diff --git a/lib/ProductOpener/Data.pm b/lib/ProductOpener/Data.pm index 74d9193cc76bf..0e52c104d7a21 100644 --- a/lib/ProductOpener/Data.pm +++ b/lib/ProductOpener/Data.pm @@ -39,6 +39,8 @@ improve performance of aggregate queries for an improved user experience and mor collection was initially proposed in L on GitHub, where some additional context is available. +Obsolete products that have been withdrawn from the market have separate collections: products_obsolete and products_obsolete_tags + =cut package ProductOpener::Data; @@ -53,7 +55,6 @@ BEGIN { &get_database &get_collection &get_products_collection - &get_products_tags_collection &get_emb_codes_collection &get_recent_changes_collection &remove_documents_by_ids @@ -115,15 +116,38 @@ sub execute_query ($sub) { )->run(); } -=head2 get_products_collection() +=head2 get_products_collection( $options_ref ) C establishes a connection to MongoDB and uses timeout as an argument. This then selects a collection from within the database. =head3 Arguments -This method takes in arguments of integer type (user defined timeout in milliseconds). -It is optional for this subroutine to have an argument. +This method takes parameters in an optional hash reference with the following keys: + +=head4 database MongoDB database name + +Defaults to $ProductOpener::Config::mongodb + +This is useful when moving products to another flavour +(e.g. 
from Open Food Facts (database: off) to Open Beauty Facts (database: obf)) + +=head4 timeout User defined timeout in milliseconds + +=head4 obsolete + +If set to a true value, the function returns a collection that contains only obsolete products, +otherwise it returns the collection with products that are not obsolete. + +=head4 tags + +If set to a true value, the function may return a smaller collection that contains only the *_tags fields, +in order to speed aggregate queries. The smaller collection is created every night, +and may therefore contain slightly stale data. + +As of 2023/03/13, we return the products_tags collection for non obsolete products. +For obsolete products, we currently return the products_obsolete collection, but we might +create a separate products_obsolete_tags collection in the future, if it becomes necessary to create one. =head3 Return values @@ -131,8 +155,18 @@ Returns a mongoDB collection object. =cut -sub get_products_collection ($timeout = undef) { - return get_collection($mongodb, 'products', $timeout); +sub get_products_collection ($options_ref = {}) { + my $database = $options_ref->{database} // $mongodb; + my $collection = 'products'; + if ($options_ref->{obsolete}) { + $collection .= '_obsolete'; + } + # We don't have a products_obsolete_tags collection at this point + # if it changes, the following elsif should be changed to a if + elsif ($options_ref->{tags}) { + $collection .= '_tags'; + } + return get_collection($database, $collection, $options_ref->{timeout}); } =head2 get_products_tags_collection() @@ -150,10 +184,6 @@ Returns a mongoDB collection. 
=cut -sub get_products_tags_collection ($timeout = undef) { - return get_collection($mongodb, 'products_tags', $timeout); -} - sub get_emb_codes_collection ($timeout = undef) { return get_collection($mongodb, 'emb_codes', $timeout); } diff --git a/lib/ProductOpener/Display.pm b/lib/ProductOpener/Display.pm index a68c95ccaa5f3..9bd2f652bbdc6 100644 --- a/lib/ProductOpener/Display.pm +++ b/lib/ProductOpener/Display.pm @@ -1444,7 +1444,8 @@ sub query_list_of_tags ($request_ref, $query_ref) { if $log->is_debug(); $results = execute_query( sub { - return get_products_collection()->aggregate($aggregate_parameters, {allowDiskUse => 1}); + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) + ->aggregate($aggregate_parameters, {allowDiskUse => 1}); } ); }; @@ -1457,7 +1458,8 @@ sub query_list_of_tags ($request_ref, $query_ref) { if $log->is_debug(); $results = execute_query( sub { - return get_products_tags_collection()->aggregate($aggregate_parameters, {allowDiskUse => 1}); + return get_products_collection({obsolete => request_param($request_ref, "obsolete"), tags => 1}) + ->aggregate($aggregate_parameters, {allowDiskUse => 1}); } ); }; @@ -1524,7 +1526,7 @@ sub query_list_of_tags ($request_ref, $query_ref) { if $log->is_debug(); $count_results = execute_query( sub { - return get_products_collection() + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) ->aggregate($aggregate_count_parameters, {allowDiskUse => 1}); } ); @@ -1537,7 +1539,8 @@ sub query_list_of_tags ($request_ref, $query_ref) { if $log->is_debug(); $count_results = execute_query( sub { - return get_products_tags_collection() + return get_products_collection( + {obsolete => request_param($request_ref, "obsolete"), tags => 1}) ->aggregate($aggregate_count_parameters, {allowDiskUse => 1}); } ); @@ -4372,7 +4375,8 @@ sub count_products ($request_ref, $query_ref) { $log->debug("Counting MongoDB documents for query", {query => $query_ref}) if 
$log->is_debug(); $count = execute_query( sub { - return get_products_collection()->count_documents($query_ref); + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) + ->count_documents($query_ref); } ); }; @@ -4891,7 +4895,8 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa $log->debug("Counting MongoDB documents for query", {query => $query_ref}) if $log->is_debug(); $count = execute_query( sub { - return get_products_tags_collection()->count_documents($query_ref); + return get_products_collection({obsolete => request_param($request_ref, "obsolete"), tags => 1}) + ->count_documents($query_ref); } ); $log->info("MongoDB count query ok", {error => $@, count => $count}) if $log->is_info(); @@ -4901,7 +4906,8 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa $log->debug("Executing MongoDB query", {query => $aggregate_parameters}) if $log->is_debug(); $cursor = execute_query( sub { - return get_products_tags_collection()->aggregate($aggregate_parameters, {allowDiskUse => 1}); + return get_products_collection({obsolete => request_param($request_ref, "obsolete"), tags => 1}) + ->aggregate($aggregate_parameters, {allowDiskUse => 1}); } ); } @@ -4951,7 +4957,9 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa $log->debug("count_documents on smaller products_tags collection", {key => $key_count}) if $log->is_debug(); - return get_products_tags_collection()->count_documents($query_ref); + return get_products_collection( + {obsolete => request_param($request_ref, "obsolete"), tags => 1}) + ->count_documents($query_ref); } ); @@ -4962,7 +4970,9 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa sub { $log->debug("count_documents on complete products collection", {key => $key_count}) if $log->is_debug(); - return get_products_collection()->count_documents($query_ref); + return get_products_collection( + 
{obsolete => request_param($request_ref, "obsolete")}) + ->count_documents($query_ref); } ); } @@ -4988,7 +4998,8 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa sub { $log->debug("empty query_ref, use estimated_document_count fot better performance", {}) if $log->is_debug(); - return get_products_collection()->estimated_document_count(); + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) + ->estimated_document_count(); } ); } @@ -4999,8 +5010,8 @@ sub search_and_display_products ($request_ref, $query_ref, $sort_by, $limit, $pa if $log->is_debug(); $cursor = execute_query( sub { - return get_products_collection()->query($query_ref)->fields($fields_ref)->sort($sort_ref) - ->limit($limit)->skip($skip); + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) + ->query($query_ref)->fields($fields_ref)->sort($sort_ref)->limit($limit)->skip($skip); } ); $log->info("MongoDB query ok", {error => $@}) if $log->is_info(); @@ -6361,7 +6372,8 @@ sub search_and_graph_products ($request_ref, $query_ref, $graph_ref) { eval { $cursor = execute_query( sub { - return get_products_collection()->query($query_ref)->fields($fields_ref); + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) + ->query($query_ref)->fields($fields_ref); } ); }; @@ -6493,7 +6505,8 @@ sub search_and_map_products ($request_ref, $query_ref, $graph_ref) { eval { $cursor = execute_query( sub { - return get_products_collection()->query($query_ref)->fields( + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) + ->query($query_ref)->fields( { code => 1, lc => 1, @@ -6505,7 +6518,7 @@ sub search_and_map_products ($request_ref, $query_ref, $graph_ref) { origins => 1, emb_codes_tags => 1, } - ); + ); } ); }; @@ -10970,7 +10983,8 @@ sub search_and_analyze_recipes ($request_ref, $query_ref) { eval { $cursor = execute_query( sub { - return 
get_products_collection()->query($query_ref)->fields($fields_ref); + return get_products_collection({obsolete => request_param($request_ref, "obsolete")}) + ->query($query_ref)->fields($fields_ref); } ); }; diff --git a/lib/ProductOpener/Products.pm b/lib/ProductOpener/Products.pm index 2c72ccd4c0ec9..19865d0b9a40a 100644 --- a/lib/ProductOpener/Products.pm +++ b/lib/ProductOpener/Products.pm @@ -1001,6 +1001,16 @@ sub compute_sort_keys ($product_ref) { return; } +=head2 store_product ($user_id, $product_ref, $comment) + +Save changes of a product: +- in a new .sto file on the disk +- in MongoDB (in the products collection, or products_obsolete collection if the product is obsolete) + +Before saving, some field values are computed, and product history and completeness is computed. + +=cut + sub store_product ($user_id, $product_ref, $comment) { my $code = $product_ref->{code}; @@ -1008,7 +1018,15 @@ sub store_product ($user_id, $product_ref, $comment) { my $path = product_path($product_ref); my $rev = $product_ref->{rev}; - $log->debug("store_product - start", {code => $code, product_id => $product_id}) if $log->is_debug(); + $log->debug( + "store_product - start", + { + code => $code, + product_id => $product_id, + obsolete => $product_ref->{obsolete}, + was_obsolete => $product_ref->{was_obsolete} + } + ) if $log->is_debug(); # In case we need to move a product from OFF to OBF etc. # the "new_server" value will be set to off, obf etc. 
@@ -1021,8 +1039,29 @@ sub store_product ($user_id, $product_ref, $comment) { my $new_data_root = $data_root; my $new_www_root = $www_root; - my $products_collection = get_products_collection(); - my $new_products_collection = $products_collection; + # We use the was_obsolete flag so that we can remove the product from its old collection + # (either products or products_obsolete) if its obsolete status has changed + my $previous_products_collection = get_products_collection({obsolete => $product_ref->{was_obsolete}}); + my $new_products_collection = get_products_collection({obsolete => $product_ref->{obsolete}}); + + # the obsolete (and was_obsolete) field is either undef or an empty string, or contains "on" + if ( ($product_ref->{was_obsolete} and not $product_ref->{obsolete}) + or (not $product_ref->{was_obsolete} and $product_ref->{obsolete})) + { + # The obsolete status changed, we need to remove the product from its previous collection + $log->debug( + "obsolete status changed", + { + code => $code, + product_id => $product_id, + obsolete => $product_ref->{obsolete}, + was_obsolete => $product_ref->{was_obsolete}, + previous_products_collection => $previous_products_collection + } + ) if $log->is_debug(); + $previous_products_collection->delete_one({"_id" => $product_ref->{_id}}); + } + delete $product_ref->{was_obsolete}; if ( (defined $product_ref->{server}) and (defined $options{other_servers}) @@ -1031,7 +1070,8 @@ sub store_product ($user_id, $product_ref, $comment) { my $server = $product_ref->{server}; $new_data_root = $options{other_servers}{$server}{data_root}; $new_www_root = $options{other_servers}{$server}{www_root}; - $new_products_collection = get_collection($options{other_servers}{$server}{mongodb}, 'products'); + $new_products_collection = get_products_collection( + {database => $options{other_servers}{$server}{mongodb}, obsolete => $product_ref->{obsolete}}); } if (defined $product_ref->{old_code}) { @@ -1043,7 +1083,8 @@ sub store_product 
($user_id, $product_ref, $comment) { my $new_server = $product_ref->{new_server}; $new_data_root = $options{other_servers}{$new_server}{data_root}; $new_www_root = $options{other_servers}{$new_server}{www_root}; - $new_products_collection = get_collection($options{other_servers}{$new_server}{mongodb}, 'products'); + $new_products_collection = get_products_collection( + {database => $options{other_servers}{$new_server}{mongodb}, obsolete => $product_ref->{obsolete}}); $product_ref->{server} = $product_ref->{new_server}; delete $product_ref->{new_server}; } @@ -1113,7 +1154,7 @@ sub store_product ($user_id, $product_ref, $comment) { execute_query( sub { - return $products_collection->delete_one({"_id" => $product_ref->{_id}}); + return $previous_products_collection->delete_one({"_id" => $product_ref->{_id}}); } ); diff --git a/scripts/export_database.pl b/scripts/export_database.pl index 008c32ef23ca4..6a6efafb44bcd 100755 --- a/scripts/export_database.pl +++ b/scripts/export_database.pl @@ -144,7 +144,7 @@ sub sanitize_field_content { # 300 000 ms timeout so that we can export the whole database # 5mins is not enough, 50k docs were exported - my $cursor = get_products_collection(3 * 60 * 60 * 1000)->query( + my $cursor = get_products_collection({timeout => 3 * 60 * 60 * 1000})->query( { 'code' => {"\$ne" => ""}, 'empty' => {"\$ne" => 1} diff --git a/scripts/export_products_data_and_images.pl b/scripts/export_products_data_and_images.pl index 7c23e5ec3a800..50c0001981053 100755 --- a/scripts/export_products_data_and_images.pl +++ b/scripts/export_products_data_and_images.pl @@ -139,7 +139,8 @@ print STDERR "MongoDB query:\n" . Dumper($query_ref) . 
"\n"; # harvest products'code from mongo db -my $cursor = get_products_collection(3 * 60 * 60 * 1000)->query($query_ref)->fields({"code" => 1})->sort({code => 1}); +my $cursor = get_products_collection({timeout => 3 * 60 * 60 * 1000})->query($query_ref)->fields({"code" => 1}) + ->sort({code => 1}); $cursor->immortal(1); diff --git a/scripts/fix_non_normalized_codes.pl b/scripts/fix_non_normalized_codes.pl index 844d26f1ac428..863b1733b0099 100755 --- a/scripts/fix_non_normalized_codes.pl +++ b/scripts/fix_non_normalized_codes.pl @@ -123,7 +123,7 @@ () # 2 mins, instead of 30s default, to not die as easily if mongodb is busy. my $socket_timeout_ms = 2 * 60000; - my $products_collection = get_products_collection($socket_timeout_ms); + my $products_collection = get_products_collection({timeout => $socket_timeout_ms}); # find int codes my @int_ids = (); @@ -194,7 +194,7 @@ ($dry_run, $out) # remove them from mongodb # 2 mins, instead of 30s default, to not die as easily if mongodb is busy. my $socket_timeout_ms = 2 * 60000; - my $products_collection = get_products_collection($socket_timeout_ms); + my $products_collection = get_products_collection({timeout => $socket_timeout_ms}); $products_collection->delete_many($int_codes_query_ref); return; @@ -211,7 +211,7 @@ ($dry_run, $out) my @ids_to_remove = (); # 2 mins, instead of 30s default, to not die as easily if mongodb is busy. 
my $socket_timeout_ms = 2 * 60000; - my $products_collection = get_products_collection($socket_timeout_ms); + my $products_collection = get_products_collection({timeout => $socket_timeout_ms}); my $cursor = $products_collection->query({})->fields({_id => 1, code => 1}); $cursor->immortal(1); while (my $product_ref = $cursor->next) { diff --git a/scripts/gen_packaging_stats.pl b/scripts/gen_packaging_stats.pl index 2f41b8c1bf161..eec59207bb546 100755 --- a/scripts/gen_packaging_stats.pl +++ b/scripts/gen_packaging_stats.pl @@ -351,7 +351,7 @@ ($name, $query_ref) }; my $socket_timeout_ms = 3 * 60 * 60 * 60000; # 3 hours - my $products_collection = get_products_collection($socket_timeout_ms); + my $products_collection = get_products_collection({timeout => $socket_timeout_ms}); my $products_count = $products_collection->count_documents($query_ref); diff --git a/scripts/gen_top_tags_per_country.pl b/scripts/gen_top_tags_per_country.pl index 45a3ce81d4f2c..0f31ab5f3b2d7 100755 --- a/scripts/gen_top_tags_per_country.pl +++ b/scripts/gen_top_tags_per_country.pl @@ -158,8 +158,8 @@ # 300 000 ms timeout so that we can export the whole database # 5mins is not enough, 50k docs were exported -my $cursor = get_products_collection(3 * 60 * 60 * 1000)->query({'empty' => {"\$ne" => 1}, 'obsolete' => {"\$ne" => 1}}) - ->sort({created_t => 1})->fields($fields_ref); +my $cursor = get_products_collection({timeout => 3 * 60 * 60 * 1000}) + ->query({'empty' => {"\$ne" => 1}, 'obsolete' => {"\$ne" => 1}})->sort({created_t => 1})->fields($fields_ref); $cursor->immortal(1); diff --git a/scripts/move_obsolete_products_to_products_obsolete_collection.pl b/scripts/move_obsolete_products_to_products_obsolete_collection.pl new file mode 100644 index 0000000000000..bf6c33a0343cd --- /dev/null +++ b/scripts/move_obsolete_products_to_products_obsolete_collection.pl @@ -0,0 +1,94 @@ +#!/usr/bin/perl -w + +# This file is part of Product Opener. 
+# +# Product Opener +# Copyright (C) 2011-2023 Association Open Food Facts +# Contact: contact@openfoodfacts.org +# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France +# +# Product Opener is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +use Modern::Perl '2017'; +use utf8; + +use ProductOpener::Config qw/:all/; +use ProductOpener::Store qw/:all/; +use ProductOpener::Products qw/:all/; +use ProductOpener::Data qw/:all/; + +use Log::Any::Adapter 'TAP'; + +my $socket_timeout_ms = 2 * 60000; # 2 mins, instead of 30s default, to not die as easily if mongodb is busy. +my $products_collection = get_products_collection({timeout=>$socket_timeout_ms}); +my $obsolete_products_collection = get_products_collection({obsolete=>1, timeout=>$socket_timeout_ms}); + +my $products_count = ""; + +my $query_ref => {obsolete => 'on'}; + +eval { + $products_count = $products_collection->count_documents($query_ref)); + + print STDERR "$products_count documents to update.\n"; +}; + +$cursor = $products_collection->query($query_ref)->fields({_id => 1, code => 1, owner => 1}); + +$cursor->immortal(1); + +my $n = 0; # number of products updated +} + +while (my $product_ref = $cursor->next) { + + my $productid = $product_ref->{_id}; + my $code = $product_ref->{code}; + my $path = product_path($product_ref); + + my $owner_info = ""; + if (defined $product_ref->{owner}) { + $owner_info = "- owner: " . 
$product_ref->{owner} . " "; + } + + if (not defined $code) { + print STDERR "code field undefined for product id: " + . $product_ref->{id} + . " _id: " + . $product_ref->{_id} . "\n"; + } + else { + print STDERR "updating product code: $code $owner_info ($n / $products_count)\n"; + } + + $product_ref = retrieve_product($productid); + + if ((defined $product_ref) and ($productid ne '')) { + + $product_ref->{_id} .= ''; + $product_ref->{code} .= ''; + $products_collection->delete_one({"_id" => $product_ref->{_id}}); + $obsolete_products_collection->replace_one({"_id" => $product_ref->{_id}}, $product_ref, {upsert => 1}); + + $n++; + } + else { + print STDERR "Unable to load product file for product code $code\n"; + } + +} + +print "$n products updated\n"; + +exit(0); diff --git a/scripts/update_all_products.pl b/scripts/update_all_products.pl index e7b4b5fc04a58..40c9ee2b9da68 100755 --- a/scripts/update_all_products.pl +++ b/scripts/update_all_products.pl @@ -349,7 +349,7 @@ print STDERR "MongoDB query:\n" . Dumper($query_ref); my $socket_timeout_ms = 2 * 60000; # 2 mins, instead of 30s default, to not die as easily if mongodb is busy. 
-my $products_collection = get_products_collection($socket_timeout_ms); +my $products_collection = get_products_collection({timeout => $socket_timeout_ms}); my $products_count = ""; From 3ef8fef0d41721a2ef3a55fdc8e98c2605a9f2cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Fri, 31 Mar 2023 17:43:14 +0200 Subject: [PATCH 2/7] fix script --- ...roducts_to_products_obsolete_collection.pl | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) mode change 100644 => 100755 scripts/move_obsolete_products_to_products_obsolete_collection.pl diff --git a/scripts/move_obsolete_products_to_products_obsolete_collection.pl b/scripts/move_obsolete_products_to_products_obsolete_collection.pl old mode 100644 new mode 100755 index bf6c33a0343cd..d7fa8107aa6b5 --- a/scripts/move_obsolete_products_to_products_obsolete_collection.pl +++ b/scripts/move_obsolete_products_to_products_obsolete_collection.pl @@ -31,25 +31,24 @@ use Log::Any::Adapter 'TAP'; my $socket_timeout_ms = 2 * 60000; # 2 mins, instead of 30s default, to not die as easily if mongodb is busy. 
-my $products_collection = get_products_collection({timeout=>$socket_timeout_ms}); -my $obsolete_products_collection = get_products_collection({obsolete=>1, timeout=>$socket_timeout_ms}); +my $products_collection = get_products_collection({timeout => $socket_timeout_ms}); +my $obsolete_products_collection = get_products_collection({obsolete => 1, timeout => $socket_timeout_ms}); my $products_count = ""; -my $query_ref => {obsolete => 'on'}; +my $query_ref = {obsolete => 'on'}; eval { - $products_count = $products_collection->count_documents($query_ref)); + $products_count = $products_collection->count_documents($query_ref); print STDERR "$products_count documents to update.\n"; }; -$cursor = $products_collection->query($query_ref)->fields({_id => 1, code => 1, owner => 1}); +my $cursor = $products_collection->query($query_ref)->fields({_id => 1, code => 1, owner => 1}); $cursor->immortal(1); my $n = 0; # number of products updated -} while (my $product_ref = $cursor->next) { @@ -72,14 +71,14 @@ print STDERR "updating product code: $code $owner_info ($n / $products_count)\n"; } - $product_ref = retrieve_product($productid); + $product_ref = retrieve_product($productid); if ((defined $product_ref) and ($productid ne '')) { - $product_ref->{_id} .= ''; - $product_ref->{code} .= ''; - $products_collection->delete_one({"_id" => $product_ref->{_id}}); - $obsolete_products_collection->replace_one({"_id" => $product_ref->{_id}}, $product_ref, {upsert => 1}); + $product_ref->{_id} .= ''; + $product_ref->{code} .= ''; + $products_collection->delete_one({"_id" => $product_ref->{_id}}); + $obsolete_products_collection->replace_one({"_id" => $product_ref->{_id}}, $product_ref, {upsert => 1}); $n++; } From 3a4bfbb043d0d4db3b4e73b3ccb3d619e2691327 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Mon, 3 Apr 2023 18:34:16 +0200 Subject: [PATCH 3/7] also export obsolete products in CSV --- scripts/export_database.pl | 361 
+++++++++++++++++++------------------ 1 file changed, 185 insertions(+), 176 deletions(-) diff --git a/scripts/export_database.pl b/scripts/export_database.pl index 6a6efafb44bcd..06066d23a1948 100755 --- a/scripts/export_database.pl +++ b/scripts/export_database.pl @@ -142,17 +142,6 @@ sub sanitize_field_content { $lc = $l; $lang = $l; - # 300 000 ms timeout so that we can export the whole database - # 5mins is not enough, 50k docs were exported - my $cursor = get_products_collection({timeout => 3 * 60 * 60 * 1000})->query( - { - 'code' => {"\$ne" => ""}, - 'empty' => {"\$ne" => 1} - } - )->fields($fields_ref)->sort({code => 1}); - - $cursor->immortal(1); - $langs{$l} = 0; my $csv_filename = "$www_root/data/$lang.$server_domain.products.csv"; @@ -279,231 +268,251 @@ sub sanitize_field_content { $csv =~ s/\t$/\n/; print $OUT $csv; - # Products + # Get products from the products collection, plus the products_obsolete collection + my @collections = ( + get_products_collection({timeout => 3 * 60 * 60 * 1000}), + get_products_collection({obsolete => 1, timeout => 3 * 60 * 60 * 1000}) + ); + my $count = 0; my %ingredients = (); - my $ct = 0; - while (my $product_ref = $cursor->next) { + foreach my $collection (@collections) { + + # 300 000 ms timeout so that we can export the whole database + # 5mins is not enough, 50k docs were exported + my $cursor = $collection->query( + { + 'code' => {"\$ne" => ""}, + 'empty' => {"\$ne" => 1} + } + )->fields($fields_ref)->sort({code => 1}); + + $cursor->immortal(1); + + while (my $product_ref = $cursor->next) { - my $csv = ''; - my $url = "http://world-$lc.$server_domain" . product_url($product_ref); - my $code = ($product_ref->{code} // ''); + my $csv = ''; + my $url = "http://world-$lc.$server_domain" . 
product_url($product_ref); + my $code = ($product_ref->{code} // ''); - $code eq '' and next; - $code < 1 and next; + $code eq '' and next; + $code < 1 and next; - $ct++; - print "$ct \n" if ($ct % 1000 == 0); # print number of products each 1000 + $count++; + print "$count \n" if ($count % 1000 == 0); # print number of products each 1000 - foreach my $field (@export_fields) { + foreach my $field (@export_fields) { - my $field_value; + my $field_value; - # _tags field contain an array of values - if ($field =~ /_tags/) { - if (defined $product_ref->{$field}) { - $field_value = join(',', @{$product_ref->{$field}}); + # _tags field contain an array of values + if ($field =~ /_tags/) { + if (defined $product_ref->{$field}) { + $field_value = join(',', @{$product_ref->{$field}}); + } + else { + $field_value = ""; + } } + # other fields else { - $field_value = ""; + $field_value = ($product_ref->{$field} // ""); } - } - # other fields - else { - $field_value = ($product_ref->{$field} // ""); - } - # Language specific field? - if ( (defined $language_fields{$field}) - and (defined $product_ref->{$field . "_" . $l}) - and ($product_ref->{$field . "_" . $l} ne '')) - { - $field_value = $product_ref->{$field . "_" . $l}; - } - - # Eco-Score data is stored in ecoscore_data.(grades|scores).(language code) - if (($field =~ /^ecoscore_(score|grade)_(\w\w)/) and (defined $product_ref->{ecoscore_data})) { - $field_value = ($product_ref->{ecoscore_data}{$1 . "s"}{$2} // ""); - } + # Language specific field? + if ( (defined $language_fields{$field}) + and (defined $product_ref->{$field . "_" . $l}) + and ($product_ref->{$field . "_" . $l} ne '')) + { + $field_value = $product_ref->{$field . "_" . 
$l}; + } - if ($field_value ne '') { - $field_value = sanitize_field_content($field_value, $BAD, "$code barcode -> field $field:"); - } + # Eco-Score data is stored in ecoscore_data.(grades|scores).(language code) + if (($field =~ /^ecoscore_(score|grade)_(\w\w)/) and (defined $product_ref->{ecoscore_data})) { + $field_value = ($product_ref->{ecoscore_data}{$1 . "s"}{$2} // ""); + } - # Add field value to CSV file - $csv .= $field_value . "\t"; + if ($field_value ne '') { + $field_value = sanitize_field_content($field_value, $BAD, "$code barcode -> field $field:"); + } - # If current field is "code", add the product url after it; example: - # 9542013592 http://world-fr.openfoodfacts.org/produit/0009542013592/gourmet-truffles-lindt - if ($field eq 'code') { - $csv .= $url . "\t"; - } + # Add field value to CSV file + $csv .= $field_value . "\t"; - # If the field name ending with _t (ie a date in epoch format), add - # a field in ISO 8601 date format; example: - # created_t created_datetime - # 1489061370 2017-03-09T12:09:30Z - if ($field =~ /_t$/) { - if (defined $product_ref->{$field} && $product_ref->{$field} > 0) { - # surprisingly slow, approx 10% of script time is here. - #my $dt = DateTime->from_epoch( epoch => $product_ref->{$field} ); - #$csv .= $dt->datetime() . 'Z' . "\t"; - my $dt = strftime("%FT%TZ", gmtime($product_ref->{$field})); - $csv .= $dt . "\t"; - } - else { - $csv .= "\t"; + # If current field is "code", add the product url after it; example: + # 9542013592 http://world-fr.openfoodfacts.org/produit/0009542013592/gourmet-truffles-lindt + if ($field eq 'code') { + $csv .= $url . "\t"; } - } - if (defined $tags_fields{$field}) { - if (defined $product_ref->{$field . '_tags'}) { - $csv .= join(',', @{$product_ref->{$field . '_tags'}}) . 
"\t"; - } - else { - $csv .= "\t"; + # If the field name ending with _t (ie a date in epoch format), add + # a field in ISO 8601 date format; example: + # created_t created_datetime + # 1489061370 2017-03-09T12:09:30Z + if ($field =~ /_t$/) { + if (defined $product_ref->{$field} && $product_ref->{$field} > 0) { + # surprisingly slow, approx 10% of script time is here. + #my $dt = DateTime->from_epoch( epoch => $product_ref->{$field} ); + #$csv .= $dt->datetime() . 'Z' . "\t"; + my $dt = strftime("%FT%TZ", gmtime($product_ref->{$field})); + $csv .= $dt . "\t"; + } + else { + $csv .= "\t"; + } } - } - if (defined $taxonomy_fields{$field}) { - if (defined $product_ref->{$field . '_tags'}) { - $csv .= join(',', map {display_taxonomy_tag($lc, $field, $_)} @{$product_ref->{$field . '_tags'}}) - . "\t"; + + if (defined $tags_fields{$field}) { + if (defined $product_ref->{$field . '_tags'}) { + $csv .= join(',', @{$product_ref->{$field . '_tags'}}) . "\t"; + } + else { + $csv .= "\t"; + } } - else { - $csv .= "\t"; + if (defined $taxonomy_fields{$field}) { + if (defined $product_ref->{$field . '_tags'}) { + $csv + .= join(',', + map {display_taxonomy_tag($lc, $field, $_)} @{$product_ref->{$field . '_tags'}}) + . "\t"; + } + else { + $csv .= "\t"; + } } - } - if ($field eq 'emb_codes') { - # take the first emb code - my $geo = ''; - if (defined $product_ref->{"emb_codes_tags"}[0]) { - my $emb_code = $product_ref->{"emb_codes_tags"}[0]; - my $city_code = get_city_code($emb_code); - if (defined $emb_codes_geo{$city_code}) { - $geo = $emb_codes_geo{$city_code}[0] . ',' . $emb_codes_geo{$city_code}[1]; + if ($field eq 'emb_codes') { + # take the first emb code + my $geo = ''; + if (defined $product_ref->{"emb_codes_tags"}[0]) { + my $emb_code = $product_ref->{"emb_codes_tags"}[0]; + my $city_code = get_city_code($emb_code); + if (defined $emb_codes_geo{$city_code}) { + $geo = $emb_codes_geo{$city_code}[0] . ',' . 
$emb_codes_geo{$city_code}[1]; + } } - } - # sanitize_field_content($field_value, $log_file, $log_msg); - if ($geo ne '') { - $geo = sanitize_field_content($geo, $BAD, "$code barcode -> field $field:"); + # sanitize_field_content($field_value, $log_file, $log_msg); + if ($geo ne '') { + $geo = sanitize_field_content($geo, $BAD, "$code barcode -> field $field:"); + } + + $csv .= $geo . "\t"; } - $csv .= $geo . "\t"; } - } - - # "main" category: lowest level category + # "main" category: lowest level category - my $main_cid = ''; - my $main_cid_lc = ''; + my $main_cid = ''; + my $main_cid_lc = ''; - if ((defined $product_ref->{categories_tags}) and (scalar @{$product_ref->{categories_tags}} > 0)) { + if ((defined $product_ref->{categories_tags}) and (scalar @{$product_ref->{categories_tags}} > 0)) { - $main_cid = $product_ref->{categories_tags}[(scalar @{$product_ref->{categories_tags}}) - 1]; + $main_cid = $product_ref->{categories_tags}[(scalar @{$product_ref->{categories_tags}}) - 1]; - $main_cid = canonicalize_tag2("categories", $main_cid); - $main_cid_lc = display_taxonomy_tag($lc, 'categories', $main_cid); - } + $main_cid = canonicalize_tag2("categories", $main_cid); + $main_cid_lc = display_taxonomy_tag($lc, 'categories', $main_cid); + } - $csv .= $main_cid . "\t"; - $csv .= $main_cid_lc . "\t"; + $csv .= $main_cid . "\t"; + $csv .= $main_cid_lc . "\t"; - $product_ref->{main_category} = $main_cid; + $product_ref->{main_category} = $main_cid; - add_images_urls_to_product($product_ref, $l); + add_images_urls_to_product($product_ref, $l); - $csv .= ($product_ref->{image_url} // "") . "\t" . ($product_ref->{image_small_url} // "") . "\t"; - $csv .= ($product_ref->{image_ingredients_url} // "") . "\t" - . ($product_ref->{image_ingredients_small_url} // "") . "\t"; - $csv .= ($product_ref->{image_nutrition_url} // "") . "\t" - . ($product_ref->{image_nutrition_small_url} // "") . "\t"; + $csv .= ($product_ref->{image_url} // "") . "\t" . 
($product_ref->{image_small_url} // "") . "\t"; + $csv .= ($product_ref->{image_ingredients_url} // "") . "\t" + . ($product_ref->{image_ingredients_small_url} // "") . "\t"; + $csv .= ($product_ref->{image_nutrition_url} // "") . "\t" + . ($product_ref->{image_nutrition_small_url} // "") . "\t"; - foreach my $nid (@nutrients_to_export) { + foreach my $nid (@nutrients_to_export) { - if (defined $product_ref->{nutriments}{$nid . "_100g"}) { - my $value = $product_ref->{nutriments}{$nid . "_100g"}; - if ($value =~ /e/) { - # 7e-05 1.71e-06 - $value = sprintf("%.10f", $value); - # Remove trailing 0s - $value =~ s/\.([0-9]+?)0*$/\.$1/g; + if (defined $product_ref->{nutriments}{$nid . "_100g"}) { + my $value = $product_ref->{nutriments}{$nid . "_100g"}; + if ($value =~ /e/) { + # 7e-05 1.71e-06 + $value = sprintf("%.10f", $value); + # Remove trailing 0s + $value =~ s/\.([0-9]+?)0*$/\.$1/g; + } + $csv .= $value . "\t"; + } + else { + $csv .= "\t"; } - $csv .= $value . "\t"; - } - else { - $csv .= "\t"; } - } - #$csv =~ s/\t$/\n/; - if (substr($csv, -1, 1) eq "\t") { - substr $csv, -1, 1, "\n"; - } + #$csv =~ s/\t$/\n/; + if (substr($csv, -1, 1) eq "\t") { + substr $csv, -1, 1, "\n"; + } - my $name = xml_escape_NFC($product_ref->{product_name}); - my $ingredients_text = xml_escape_NFC($product_ref->{ingredients_text}); + my $name = xml_escape_NFC($product_ref->{product_name}); + my $ingredients_text = xml_escape_NFC($product_ref->{ingredients_text}); - my $rdf = < $code $name ${ingredients_text} XML - ; + ; + + if (defined $product_ref->{ingredients}) { + + foreach my $i (@{$product_ref->{ingredients}}) { + + # Encode URI + my $ing_encoded = URI::Escape::XS::encodeURIComponent($i->{id}); + $rdf + .= "\t\n" + . "\t\t\n" + . "\t\t\t\n"; + not defined $ingredients{$i->{id}} and $ingredients{$i->{id}} = {}; + $ingredients{$i->{id}}{ucfirst($i->{text})}++; + if (defined $i->{rank}) { + $rdf .= "\t\t\t" . $i->{rank} . 
"\n"; + } + if (defined $i->{percent}) { + $rdf .= "\t\t\t" . $i->{percent} . "\n"; + } + $rdf .= "\t\t\n"; + $rdf .= "\t\n"; - if (defined $product_ref->{ingredients}) { - - foreach my $i (@{$product_ref->{ingredients}}) { - - # Encode URI - my $ing_encoded = URI::Escape::XS::encodeURIComponent($i->{id}); - $rdf - .= "\t\n" - . "\t\t\n" - . "\t\t\t\n"; - not defined $ingredients{$i->{id}} and $ingredients{$i->{id}} = {}; - $ingredients{$i->{id}}{ucfirst($i->{text})}++; - if (defined $i->{rank}) { - $rdf .= "\t\t\t" . $i->{rank} . "\n"; - } - if (defined $i->{percent}) { - $rdf .= "\t\t\t" . $i->{percent} . "\n"; } - $rdf .= "\t\t\n"; - $rdf .= "\t\n"; - } - } - foreach my $nutrient_tagid (sort(get_all_taxonomy_entries("nutrients"))) { + foreach my $nutrient_tagid (sort(get_all_taxonomy_entries("nutrients"))) { - my $nid = $nutrient_tagid; - $nid =~ s/^zz://g; + my $nid = $nutrient_tagid; + $nid =~ s/^zz://g; - if ( (defined $product_ref->{nutriments}{$nid . '_100g'}) - and ($product_ref->{nutriments}{$nid . '_100g'} ne '')) - { - my $property = $nid; - $property =~ s/-([a-z])/ucfirst($1)/eg; - $property .= "Per100g"; + if ( (defined $product_ref->{nutriments}{$nid . '_100g'}) + and ($product_ref->{nutriments}{$nid . '_100g'} ne '')) + { + my $property = $nid; + $property =~ s/-([a-z])/ucfirst($1)/eg; + $property .= "Per100g"; - $rdf .= "\t" . $product_ref->{nutriments}{$nid . '_100g'} . "\n"; + $rdf .= "\t" . $product_ref->{nutriments}{$nid . '_100g'} . 
"\n"; + } } - } - $rdf .= "\n\n"; + $rdf .= "\n\n"; - print $OUT $csv; - print $RDF $rdf; + print $OUT $csv; + print $RDF $rdf; + } } - $langs{$l} = $ct; - $total += $ct; + $langs{$l} = $count; + $total += $count; close $OUT; close $BAD; From d2392dc8321b4bbd2f3cb459504cf5540013ac9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Mon, 3 Apr 2023 19:29:57 +0200 Subject: [PATCH 4/7] stopwords --- stop_words.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/stop_words.txt b/stop_words.txt index a05a3e4df634c..62198f62bb215 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -13,6 +13,7 @@ api appid aromatisées arôme +autocomplete backticks barcode barcodes @@ -39,6 +40,8 @@ csv CSV d'acérola dans +dataset +datasets de dont ecoscore @@ -52,6 +55,7 @@ eval EXIF Fabriqué FILEHANDLE +flavour flavouring flavourings Folksonomy @@ -118,6 +122,8 @@ nutri Nutri nutriscore Nutriscore +obf +off openfoodfacts OpenFoodFacts Origine From 12be46dd1c24511bdd659fb666b525de05ce60c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Tue, 4 Apr 2023 15:42:34 +0200 Subject: [PATCH 5/7] weighers --- stop_words.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stop_words.txt b/stop_words.txt index 8f8d92328917f..15f003e1d4a2d 100644 --- a/stop_words.txt +++ b/stop_words.txt @@ -190,5 +190,7 @@ Valeur viande vitamines VPF +weigher +weighers www xml From 24bdfa2ca530eef6c29b19eee9dc53e19c096e84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Tue, 4 Apr 2023 17:37:39 +0200 Subject: [PATCH 6/7] Update scripts/move_obsolete_products_to_products_obsolete_collection.pl Co-authored-by: Alex Garel --- .../move_obsolete_products_to_products_obsolete_collection.pl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/move_obsolete_products_to_products_obsolete_collection.pl b/scripts/move_obsolete_products_to_products_obsolete_collection.pl index d7fa8107aa6b5..267ed834fdf6f 100755 --- 
a/scripts/move_obsolete_products_to_products_obsolete_collection.pl +++ b/scripts/move_obsolete_products_to_products_obsolete_collection.pl @@ -20,8 +20,7 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -use Modern::Perl '2017'; -use utf8; +use ProductOpener::PerlStandards; use ProductOpener::Config qw/:all/; use ProductOpener::Store qw/:all/; From 0ee9ca3a738e6527d664d60f31e7ba8ad693531e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Gigandet?= Date: Wed, 5 Apr 2023 10:30:40 +0200 Subject: [PATCH 7/7] suggestions from code review --- lib/ProductOpener/Food.pm | 3 ++- lib/ProductOpener/Products.pm | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/ProductOpener/Food.pm b/lib/ProductOpener/Food.pm index db0cec8ae9622..f65a42475c899 100644 --- a/lib/ProductOpener/Food.pm +++ b/lib/ProductOpener/Food.pm @@ -1662,7 +1662,8 @@ sub create_nutrients_level_taxonomy() { my ($nid, $low, $high) = @{$nutrient_level_ref}; foreach my $level ('low', 'moderate', 'high') { $nutrient_levels_taxonomy - .= "\n" . 'en:' + .= "\n" + . 'en:' . 
sprintf( $Lang{nutrient_in_quantity}{en}, display_taxonomy_tag("en", "nutrients", "zz:$nid"), diff --git a/lib/ProductOpener/Products.pm b/lib/ProductOpener/Products.pm index 19865d0b9a40a..8ac37853f92bc 100644 --- a/lib/ProductOpener/Products.pm +++ b/lib/ProductOpener/Products.pm @@ -1043,6 +1043,7 @@ sub store_product ($user_id, $product_ref, $comment) { # (either products or products_obsolete) if its obsolete status has changed my $previous_products_collection = get_products_collection({obsolete => $product_ref->{was_obsolete}}); my $new_products_collection = get_products_collection({obsolete => $product_ref->{obsolete}}); + my $delete_from_previous_products_collection = 0; # the obsolete (and was_obsolete) field is either undef or an empty string, or contains "on" if ( ($product_ref->{was_obsolete} and not $product_ref->{obsolete}) @@ -1059,7 +1060,7 @@ sub store_product ($user_id, $product_ref, $comment) { previous_products_collection => $previous_products_collection } ) if $log->is_debug(); - $previous_products_collection->delete_one({"_id" => $product_ref->{_id}}); + $delete_from_previous_products_collection = 1; } delete $product_ref->{was_obsolete}; @@ -1303,6 +1304,10 @@ sub store_product ($user_id, $product_ref, $comment) { #return 0; } + # First store the product data in a .sto file on disk + store("$new_data_root/products/$path/$rev.sto", $product_ref); + + # Also store the product in MongoDB, unless it was marked as deleted if ($product_ref->{deleted}) { $new_products_collection->delete_one({"_id" => $product_ref->{_id}}); } @@ -1310,7 +1315,11 @@ sub store_product ($user_id, $product_ref, $comment) { $new_products_collection->replace_one({"_id" => $product_ref->{_id}}, $product_ref, {upsert => 1}); } - store("$new_data_root/products/$path/$rev.sto", $product_ref); + # product that has a changed obsolete status + if ($delete_from_previous_products_collection) { + $previous_products_collection->delete_one({"_id" => $product_ref->{_id}}); + } + # 
Update link my $link = "$new_data_root/products/$path/product.sto"; if (-l $link) {