openfoodfacts · stephanegigandet · Mar 16, 2023 · Mar 16, 2023 · Mar 16, 2023 · Mar 16, 2023
diff --git a/html/images/lang/sv/a-market.141x90.png → ...images/lang/sv/labels/ä-märket.141x90.png b/html/images/lang/sv/a-market.141x90.png → ...images/lang/sv/labels/ä-märket.141x90.png
@@ -158,6 +158,11 @@ use ProductOpener::Config2;
 		unaccent => 1,
 		lowercase => 1,
 	},
+	# xx: language less entries, also deaccent
+	xx => {
+		unaccent => 1,
+		lowercase => 1,
+	},
 );
 
 %admins = map {$_ => 1} qw(

@@ -179,6 +179,7 @@ use File::Copy;
 use MIME::Base64 qw(encode_base64);
 use POSIX qw(strftime);
 use LWP::UserAgent ();
+use Encode;
 
 use GraphViz2;
 use JSON::PP;
@@ -447,12 +448,15 @@ sub load_tags_images ($lc, $tagtype) {
 	defined $tags_images{$lc}{$tagtype} or $tags_images{$lc}{$tagtype} = {};
 
 	if (opendir(DH2, "$www_root/images/lang/$lc/$tagtype")) {
-		foreach my $file (readdir(DH2)) {
+		foreach my $file (sort readdir(DH2)) {
+			# Note: readdir returns bytes, which may be utf8 on some systems
+			# see https://perldoc.perl.org/perlunicode#When-Unicode-Does-Not-Happen
+			$file = decode('utf8', $file);
 			if ($file =~ /^((.*)\.\d+x${logo_height}.(png|svg))$/) {
 				if ((not defined $tags_images{$lc}{$tagtype}{$2}) or ($3 eq 'svg')) {
 					$tags_images{$lc}{$tagtype}{$2} = $1;
-					# print STDERR "load_tags_images - tags_images - lc: $lc - tagtype: $tagtype - tag: $2 - img: $1 - ext: $3 \n";
-					# print "load_tags_images - tags_images - loading lc: $lc - tagtype: $tagtype - tag: $2 - img: $1 - ext: $3 \n";
+					print STDERR
+						"load_tags_images - tags_images - loading lc: $lc - tagtype: $tagtype - tag: $2 - img: $1 - ext: $3 \n";
 				}
 			}
 		}
@@ -686,6 +690,14 @@ sub get_from_cache ($tagtype, @files) {
 	my $tag_www_root = "$www_root/data/taxonomies/$tagtype";
 
 	my $sha1 = Digest::SHA1->new;
+
+	# Add a version string to the taxonomy data
+	# Change this version string if you want to force the taxonomies to be rebuilt
+	# e.g. if the taxonomy building algorithm or configuration has changed
+	# This needs to be done also when the unaccenting parameters for languages set in Config.pm are changed
+
+	$sha1->add("20230316 - made xx: unaccented");
+
 	foreach my $source_file (@files) {
 		open(my $IN, "<", "$data_root/taxonomies/$source_file.txt")
 			or die("Cannot open $data_root/taxonomies/$source_file.txt : $!\n");
@@ -2128,18 +2140,18 @@ sub country_to_cc ($country) {
 # load all tags images
 
 # print STDERR "Tags.pm - loading tags images\n";
-if (opendir my $DH2, $lang_dir) {
-	foreach my $langid (readdir($DH2)) {
+if (opendir my $DH2, "$www_root/images/lang") {
+	foreach my $langid (sort readdir($DH2)) {
 		next if $langid eq '.';
 		next if $langid eq '..';
-		# print STDERR "Tags.pm - reading tagtypes for lang $langid\n";
+		print STDERR "Tags.pm - reading tagtypes for lang $langid\n";
 		next if ((length($langid) ne 2) and not($langid eq 'other'));
 
 		if (-e "$www_root/images/lang/$langid") {
 			opendir my $DH, "$www_root/images/lang/$langid" or die "Couldn't open the current directory: $!";
-			foreach my $tagtype (readdir($DH)) {
+			foreach my $tagtype (sort readdir($DH)) {
 				next if $tagtype =~ /\./;
-				# print STDERR "Tags: loading tagtype images $langid/$tagtype\n";
+				print STDERR "Tags: loading tagtype images $langid/$tagtype\n";
 				# print "Tags: loading tagtype images $langid/$tagtype\n";
 				load_tags_images($langid, $tagtype);
 			}
@@ -2773,7 +2785,7 @@ If an image is associated to a tag, return its relative url, otherwise return un
 =head4 $target_lc
 
 The desired language for the image. If an image is not available in the target language,
-it can be returned in the tag language, or in English.
+it can be returned in English or in the tag language.
 
 =head4 $tagtype
 
@@ -2785,40 +2797,30 @@ The type of the tag (e.g. categories, labels, allergens)
 
 sub get_tag_image ($target_lc, $tagtype, $canon_tagid) {
 
-	my $img;
-
-	my $target_title = display_taxonomy_tag($target_lc, $tagtype, $canon_tagid);
-
-	my $img_lc = $target_lc;
-
-	my $lc_imgid = get_string_id_for_lang($target_lc, $target_title);
-	my $en_imgid = get_taxonomyid("en", $canon_tagid);
-	my $tag_lc = undef;
-	if ($en_imgid =~ /^(\w\w):/) {
-		$en_imgid = $';
-		$tag_lc = $1;
+	# Build an ordered list of languages that the image can be in
+	my @languages = ($target_lc, "xx", "en");
+	if ($canon_tagid =~ /^(\w\w):/) {
+		push @languages, $1;
 	}
 
-	if (defined $tags_images{$target_lc}{$tagtype}{$lc_imgid}) {
-		$img = $tags_images{$target_lc}{$tagtype}{$lc_imgid};
-	}
-	elsif ( (defined $tag_lc)
-		and (defined $tags_images{$tag_lc})
-		and (defined $tags_images{$tag_lc}{$tagtype}{$en_imgid}))
-	{
-		$img = $tags_images{$tag_lc}{$tagtype}{$en_imgid};
-		$img_lc = $tag_lc;
-	}
-	elsif (defined $tags_images{'en'}{$tagtype}{$en_imgid}) {
-		$img = $tags_images{'en'}{$tagtype}{$en_imgid};
-		$img_lc = 'en';
-	}
+	# Record which language we tested, as the list can contain the same language multiple times
+	my %seen_lc = ();
 
-	if ($img) {
-		$img = "/images/lang/${img_lc}/$tagtype/" . $img;
+	foreach my $l (@languages) {
+		next if defined $seen_lc{$l};
+		$seen_lc{$l} = 1;
+		my $translation = display_taxonomy_tag($l, $tagtype, $canon_tagid);
+		# Support both unaccented and possibly deaccented image file name
+		foreach
+			my $imgid (get_string_id_for_lang("no_language", $translation), get_string_id_for_lang($l, $translation))
+		{
+			if (defined $tags_images{$l}{$tagtype}{$imgid}) {
+				return "/images/lang/$l/$tagtype/" . $tags_images{$l}{$tagtype}{$imgid};
+			}
+		}
 	}
 
-	return $img;
+	return;
 }
 
 =head2 display_tags_hierarchy_taxonomy ( $target_lc, $tagtype, $tags_ref )
@@ -2989,6 +2991,8 @@ sub get_taxonomyurl ($tag_lc, $tagid) {
 	}
 }
 
+# Return the canonical id of a tag string in a specific language
+
 sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) {
 
 	if (not defined $tag) {
@@ -3009,6 +3013,8 @@ sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) {
 		return $weblink_tag;
 	}
 
+	# If we are passed a tag string that starts with a language code (e.g. fr:café)
+	# override the input language
 	if ($tag =~ /^(\w\w):/) {
 		$tag_lc = $1;
 		$tag = $';
@@ -3106,17 +3112,20 @@ sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) {
 
 				next if ($test_lc eq $tag_lc);
 
+				# get a tagid with the unaccenting rules for the language we are trying to match
+				my $test_lc_tagid = get_string_id_for_lang($test_lc, $tag);
+
 				if (    (defined $synonyms{$tagtype})
 					and (defined $synonyms{$tagtype}{$test_lc})
-					and (defined $synonyms{$tagtype}{$test_lc}{$tagid}))
+					and (defined $synonyms{$tagtype}{$test_lc}{$test_lc_tagid}))
 				{
-					$tagid = $synonyms{$tagtype}{$test_lc}{$tagid};
+					$tagid = $synonyms{$tagtype}{$test_lc}{$test_lc_tagid};
 					$tag_lc = $test_lc;
 				}
 				else {
 
 					# try removing stopwords and plurals
-					my $tagid2 = remove_stopwords($tagtype, $test_lc, $tagid);
+					my $tagid2 = remove_stopwords($tagtype, $test_lc, $test_lc_tagid);
 					$tagid2 = remove_plurals($test_lc, $tagid2);
 					if (    (defined $synonyms{$tagtype})
 						and (defined $synonyms{$tagtype}{$test_lc})

diff --git a/taxonomies/test.txt b/taxonomies/test.txt
@@ -94,3 +94,7 @@ de:Special value for German 2
 
 xx:Language-less entry
 de:Special value for German 3
+
+# xx: entry with accents, need to match unaccented version
+sv:Ä-märket
+xx:Ä-märket
@@ -1,7 +1,6 @@
 #!/usr/bin/perl -w
 
-use Modern::Perl '2017';
-use utf8;
+use ProductOpener::PerlStandards;
 
 use Test::More;
 #use Log::Any::Adapter 'TAP', filter => "none";
@@ -673,4 +672,27 @@ is(
 	"fr:un label français inconnu, Ecológico, en:A New English label, Missing language prefix, Comercio justo, en:one-percent-for-the-planet"
 );
 
+is(canonicalize_taxonomy_tag('fr', 'categories', 'café'), "en:coffees");
+
+# Tests to verify we match the xx:Ä Märket entry
+is(canonicalize_taxonomy_tag('sv', 'test', 'A Market'), "sv:ä-märket");    # matches the xx: entry which is unaccented
+is(canonicalize_taxonomy_tag('sv', 'test', 'Ä Märket'), "sv:ä-märket");
+is(canonicalize_taxonomy_tag('en', 'test', 'Ä Märket'), "sv:ä-märket");
+is(canonicalize_taxonomy_tag('en', 'test', 'A-MArket'), "sv:ä-märket");
+is(canonicalize_taxonomy_tag('en', 'test', 'en:Ä Märket'), "sv:ä-märket");
+is(canonicalize_taxonomy_tag('en', 'test', 'en:A MArket'), "sv:ä-märket");
+is(canonicalize_taxonomy_tag('en', 'test', 'en:a-market'), "sv:ä-märket");
+is(canonicalize_taxonomy_tag('de', 'test', 'Ä Märket'), "sv:ä-märket")
+	;    # no unaccent in German, but need to deaccent to match the xx: entry
+
+is(display_taxonomy_tag("fr", "test", "sv:ä-märket"), "Ä-märket");
+
+# Tags images
+is(get_tag_image("en", "labels", "en:usda-organic"), "/images/lang/en/labels/usda-organic.90x90.svg");
+is(get_tag_image("sv", "labels", "sv:ä-märket"), "/images/lang/sv/labels/ä-märket.141x90.png");  # file name is accented
+is(get_tag_image("fr", "labels", "fr:commerce-equitable"), "/images/lang/fr/labels/commerce-equitable.96x90.png")
+	;    # file name is unaccented, unaccented language
+is(get_tag_image("fr", "labels", "fi:sydänmerkki"), "/images/lang/fi/labels/sydanmerkki.90x90.png")
+	;    # file name is unaccented, accented language
+
 done_testing();