Skip to content

Commit

Permalink
chore: script to preprocess intermarche packaging excel file (#8431)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: Alex Garel <alex@garel.org>
  • Loading branch information
stephanegigandet and alexgarel authored Jun 5, 2023
1 parent 454f851 commit 7f96648
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 1 deletion.
111 changes: 111 additions & 0 deletions scripts/preprocess_intermarche_packaging_csv_file.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/perl -w

# This file is part of Product Opener.
#
# Product Opener
# Copyright (C) 2011-2023 Association Open Food Facts
# Contact: contact@openfoodfacts.org
# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
#
# Product Opener is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

=head1 NAME
=head1 DESCRIPTION
The Excel file that contains packaging data for Intermarché / Les Mousquetaires products
has 1 line for each product / packaging component / packaging material combination.
Packaging components can have multiple materials (e.g. a lid may be 95% steel and 5% plastic),
which is currently not supported in the OFF packagings structure.
This script filters the CSV file corresponding to the Excel file in order to keep only
the dominant material of each packaging component.
=cut

use ProductOpener::PerlStandards;

use Text::CSV;
use Data::DeepAccess qw(deep_get deep_set deep_exists);

my $separator = "\t";

my $csv_options_ref
= {binary => 1, sep_char => $separator, eol => "\n", quote_space => 0}; # should set binary attribute.

my $csv_file = $ARGV[0];

open my $in, "<:encoding(utf8)", $csv_file or die "cannot read $csv_file: $!";

$csv_file =~ s/\.csv$/.preprocessed.csv/;

open my $out, ">:encoding(utf8)", $csv_file or die "cannot write $csv_file: $!";

my $csv = Text::CSV->new($csv_options_ref)
or die("Cannot use CSV: " . Text::CSV->error_diag());

my $header_row_ref = $csv->getline($in);

# We will create a hash with the hashing components
my %packaging_components = ();

while (my $row_ref = $csv->getline($in)) {

# Columns names:
# n°CDC 1= CDC à conserver N/A = CDC d'un frn en partage de marché à ne pas conserver
# Marque
# Nom produit
# GTIN
# Usage de l'emballage
# Type d'emballage
# Poids unitaire vide en gramme
# Nombre d’unités de composants
# Matériau
# % en poids

# Build a key to identify unique packaging components
# We start with barcode, as we will sort on this key in output
# and want to keep one product's packaging components together
my $key = $row_ref->[4] # barcode
. " - " . $row_ref->[6] # shape
. " - " . $row_ref->[7] # total weight of unit
. " - " . $row_ref->[8] # number of units
;

my $material = $row_ref->[9];
my $percent_weight = $row_ref->[10] || 0; # some rows have no values for the weight
$percent_weight =~ s/\%//;

deep_set(\%packaging_components, ($key, $material), [$percent_weight, $row_ref]);
}

close $in;

# Output the packaging components, keeping only the material with the highest percent

$csv->print($out, $header_row_ref);

foreach my $key (sort keys %packaging_components) {
# we are considering a unique packaging component
# sort on percent_weight for each material
my @materials = sort {$packaging_components{$key}{$b}[0] <=> $packaging_components{$key}{$a}[0]}
keys %{$packaging_components{$key}};

# only keep the most relevant material for sake of simplicity
$csv->print($out, $packaging_components{$key}{$materials[0]}[1]);
}

close $out;
4 changes: 3 additions & 1 deletion stop_words.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ http
https
incrontab
Ingrédients
Intermarché
ip
IPs
iso
Expand All @@ -126,6 +127,7 @@ matche
md
métal
mongoDB
Mousquetaires
msgctxt
multiline
Multiline
Expand Down Expand Up @@ -217,4 +219,4 @@ weigher
weighers
www
xml
gzipped
gzipped

0 comments on commit 7f96648

Please sign in to comment.