Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: support for accented language less taxonomy entries + images #8218

Merged
merged 6 commits into from
Mar 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lib/ProductOpener/Config_off.pm
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ use ProductOpener::Config2;
unaccent => 1,
lowercase => 1,
},
# xx: language less entries, also deaccent
xx => {
unaccent => 1,
lowercase => 1,
},
);

%admins = map {$_ => 1} qw(
Expand Down
91 changes: 50 additions & 41 deletions lib/ProductOpener/Tags.pm
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ use File::Copy;
use MIME::Base64 qw(encode_base64);
use POSIX qw(strftime);
use LWP::UserAgent ();
use Encode;

use GraphViz2;
use JSON::PP;
Expand Down Expand Up @@ -447,12 +448,15 @@ sub load_tags_images ($lc, $tagtype) {
defined $tags_images{$lc}{$tagtype} or $tags_images{$lc}{$tagtype} = {};

if (opendir(DH2, "$www_root/images/lang/$lc/$tagtype")) {
foreach my $file (readdir(DH2)) {
foreach my $file (sort readdir(DH2)) {
# Note: readdir returns bytes, which may be utf8 on some systems
# see https://perldoc.perl.org/perlunicode#When-Unicode-Does-Not-Happen
$file = decode('utf8', $file);
if ($file =~ /^((.*)\.\d+x${logo_height}.(png|svg))$/) {
if ((not defined $tags_images{$lc}{$tagtype}{$2}) or ($3 eq 'svg')) {
$tags_images{$lc}{$tagtype}{$2} = $1;
# print STDERR "load_tags_images - tags_images - lc: $lc - tagtype: $tagtype - tag: $2 - img: $1 - ext: $3 \n";
# print "load_tags_images - tags_images - loading lc: $lc - tagtype: $tagtype - tag: $2 - img: $1 - ext: $3 \n";
print STDERR
"load_tags_images - tags_images - loading lc: $lc - tagtype: $tagtype - tag: $2 - img: $1 - ext: $3 \n";
}
}
}
Expand Down Expand Up @@ -686,6 +690,14 @@ sub get_from_cache ($tagtype, @files) {
my $tag_www_root = "$www_root/data/taxonomies/$tagtype";

my $sha1 = Digest::SHA1->new;

# Add a version string to the taxonomy data
# Change this version string if you want to force the taxonomies to be rebuilt
# e.g. if the taxonomy building algorithm or configuration has changed
# This needs to be done also when the unaccenting parameters for languages set in Config.pm are changed

$sha1->add("20230316 - made xx: unaccented");
Comment on lines +694 to +699
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very good idea :-)


foreach my $source_file (@files) {
open(my $IN, "<", "$data_root/taxonomies/$source_file.txt")
or die("Cannot open $data_root/taxonomies/$source_file.txt : $!\n");
Expand Down Expand Up @@ -2128,18 +2140,18 @@ sub country_to_cc ($country) {
# load all tags images

# print STDERR "Tags.pm - loading tags images\n";
if (opendir my $DH2, $lang_dir) {
foreach my $langid (readdir($DH2)) {
if (opendir my $DH2, "$www_root/images/lang") {
foreach my $langid (sort readdir($DH2)) {
next if $langid eq '.';
next if $langid eq '..';
# print STDERR "Tags.pm - reading tagtypes for lang $langid\n";
print STDERR "Tags.pm - reading tagtypes for lang $langid\n";
next if ((length($langid) ne 2) and not($langid eq 'other'));

if (-e "$www_root/images/lang/$langid") {
opendir my $DH, "$www_root/images/lang/$langid" or die "Couldn't open the current directory: $!";
foreach my $tagtype (readdir($DH)) {
foreach my $tagtype (sort readdir($DH)) {
next if $tagtype =~ /\./;
# print STDERR "Tags: loading tagtype images $langid/$tagtype\n";
print STDERR "Tags: loading tagtype images $langid/$tagtype\n";
# print "Tags: loading tagtype images $langid/$tagtype\n";
load_tags_images($langid, $tagtype);
}
Expand Down Expand Up @@ -2773,7 +2785,7 @@ If an image is associated to a tag, return its relative url, otherwise return un
=head4 $target_lc

The desired language for the image. If an image is not available in the target language,
it can be returned in the tag language, or in English.
it can be returned in English or in the tag language.

=head4 $tagtype

Expand All @@ -2785,40 +2797,30 @@ The type of the tag (e.g. categories, labels, allergens)

sub get_tag_image ($target_lc, $tagtype, $canon_tagid) {

my $img;

my $target_title = display_taxonomy_tag($target_lc, $tagtype, $canon_tagid);

my $img_lc = $target_lc;

my $lc_imgid = get_string_id_for_lang($target_lc, $target_title);
my $en_imgid = get_taxonomyid("en", $canon_tagid);
my $tag_lc = undef;
if ($en_imgid =~ /^(\w\w):/) {
$en_imgid = $';
$tag_lc = $1;
# Build an ordered list of languages that the image can be in
my @languages = ($target_lc, "xx", "en");
if ($canon_tagid =~ /^(\w\w):/) {
push @languages, $1;
}

if (defined $tags_images{$target_lc}{$tagtype}{$lc_imgid}) {
$img = $tags_images{$target_lc}{$tagtype}{$lc_imgid};
}
elsif ( (defined $tag_lc)
and (defined $tags_images{$tag_lc})
and (defined $tags_images{$tag_lc}{$tagtype}{$en_imgid}))
{
$img = $tags_images{$tag_lc}{$tagtype}{$en_imgid};
$img_lc = $tag_lc;
}
elsif (defined $tags_images{'en'}{$tagtype}{$en_imgid}) {
$img = $tags_images{'en'}{$tagtype}{$en_imgid};
$img_lc = 'en';
}
# Record which language we tested, as the list can contain the same language multiple times
my %seen_lc = ();

if ($img) {
$img = "/images/lang/${img_lc}/$tagtype/" . $img;
foreach my $l (@languages) {
next if defined $seen_lc{$l};
$seen_lc{$l} = 1;
my $translation = display_taxonomy_tag($l, $tagtype, $canon_tagid);
# Support both unaccented and possibly deaccented image file name
foreach
my $imgid (get_string_id_for_lang("no_language", $translation), get_string_id_for_lang($l, $translation))
{
if (defined $tags_images{$l}{$tagtype}{$imgid}) {
return "/images/lang/$l/$tagtype/" . $tags_images{$l}{$tagtype}{$imgid};
}
}
}

return $img;
return;
}

=head2 display_tags_hierarchy_taxonomy ( $target_lc, $tagtype, $tags_ref )
Expand Down Expand Up @@ -2989,6 +2991,8 @@ sub get_taxonomyurl ($tag_lc, $tagid) {
}
}

# Return the canonical id of a tag string in a specific language

sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) {

if (not defined $tag) {
Expand All @@ -3009,6 +3013,8 @@ sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) {
return $weblink_tag;
}

# If we are passed a tag string that starts with a language code (e.g. fr:café)
# override the input language
if ($tag =~ /^(\w\w):/) {
$tag_lc = $1;
$tag = $';
Expand Down Expand Up @@ -3106,17 +3112,20 @@ sub canonicalize_taxonomy_tag ($tag_lc, $tagtype, $tag) {

next if ($test_lc eq $tag_lc);

# get a tagid with the unaccenting rules for the language we are trying to match
my $test_lc_tagid = get_string_id_for_lang($test_lc, $tag);

if ( (defined $synonyms{$tagtype})
and (defined $synonyms{$tagtype}{$test_lc})
and (defined $synonyms{$tagtype}{$test_lc}{$tagid}))
and (defined $synonyms{$tagtype}{$test_lc}{$test_lc_tagid}))
{
$tagid = $synonyms{$tagtype}{$test_lc}{$tagid};
$tagid = $synonyms{$tagtype}{$test_lc}{$test_lc_tagid};
$tag_lc = $test_lc;
}
else {

# try removing stopwords and plurals
my $tagid2 = remove_stopwords($tagtype, $test_lc, $tagid);
my $tagid2 = remove_stopwords($tagtype, $test_lc, $test_lc_tagid);
$tagid2 = remove_plurals($test_lc, $tagid2);
if ( (defined $synonyms{$tagtype})
and (defined $synonyms{$tagtype}{$test_lc})
Expand Down
4 changes: 4 additions & 0 deletions taxonomies/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,7 @@ de:Special value for German 2

xx:Language-less entry
de:Special value for German 3

# xx: entry with accents, need to match unaccented version
sv:Ä-märket
xx:Ä-märket
26 changes: 24 additions & 2 deletions tests/unit/tags.t
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/perl -w

use Modern::Perl '2017';
use utf8;
use ProductOpener::PerlStandards;

use Test::More;
#use Log::Any::Adapter 'TAP', filter => "none";
Expand Down Expand Up @@ -673,4 +672,27 @@ is(
"fr:un label français inconnu, Ecológico, en:A New English label, Missing language prefix, Comercio justo, en:one-percent-for-the-planet"
);

is(canonicalize_taxonomy_tag('fr', 'categories', 'café'), "en:coffees");

# Tests to verify we match the xx:Ä Märket entry
is(canonicalize_taxonomy_tag('sv', 'test', 'A Market'), "sv:ä-märket"); # matches the xx: entry which is unaccented
is(canonicalize_taxonomy_tag('sv', 'test', 'Ä Märket'), "sv:ä-märket");
is(canonicalize_taxonomy_tag('en', 'test', 'Ä Märket'), "sv:ä-märket");
is(canonicalize_taxonomy_tag('en', 'test', 'A-MArket'), "sv:ä-märket");
is(canonicalize_taxonomy_tag('en', 'test', 'en:Ä Märket'), "sv:ä-märket");
is(canonicalize_taxonomy_tag('en', 'test', 'en:A MArket'), "sv:ä-märket");
is(canonicalize_taxonomy_tag('en', 'test', 'en:a-market'), "sv:ä-märket");
is(canonicalize_taxonomy_tag('de', 'test', 'Ä Märket'), "sv:ä-märket")
; # no unaccent in German, but need to deaccent to match the xx: entry

is(display_taxonomy_tag("fr", "test", "sv:ä-märket"), "Ä-märket");

# Tags images
is(get_tag_image("en", "labels", "en:usda-organic"), "/images/lang/en/labels/usda-organic.90x90.svg");
is(get_tag_image("sv", "labels", "sv:ä-märket"), "/images/lang/sv/labels/ä-märket.141x90.png"); # file name is accented
is(get_tag_image("fr", "labels", "fr:commerce-equitable"), "/images/lang/fr/labels/commerce-equitable.96x90.png")
; # file name is unaccented, unaccented language
is(get_tag_image("fr", "labels", "fi:sydänmerkki"), "/images/lang/fi/labels/sydanmerkki.90x90.png")
; # file name is unaccented, accented language

done_testing();