Skip to content

Commit

Permalink
#15 Invalid unicode codepoint warning
Browse files Browse the repository at this point in the history
Use "utf-8-strict" rather than "utf8" when decoding filenames when efs is in play.
That traps invalid code points immediately.
  • Loading branch information
pmqs committed Mar 25, 2024
1 parent adb9b9b commit f52cb23
Show file tree
Hide file tree
Showing 8 changed files with 168 additions and 23 deletions.
38 changes: 25 additions & 13 deletions bin/zipdetails
Original file line number Diff line number Diff line change
Expand Up @@ -2427,18 +2427,29 @@ sub validateFilename

# Portability (mostly with Windows)

# see https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file
state $badDosFilename = join '|', map { quotemeta }
qw(CON PRN AUX NUL
COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9
LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9
) ;
return "Portability Issue: '$1' is a reserved Windows device name"
if $filename =~ /^($badDosFilename)$/io ;

# Can't have the device name with an extension either
return "Portability Issue: '$1' is a reserved Windows device name"
if $filename =~ /^($badDosFilename)\./io ;
{
# see https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file
state $badDosFilename = join '|', map { quotemeta }
qw(CON PRN AUX NUL
COM1 COM2 COM3 COM4 COM5 COM6 COM7 COM8 COM9
LPT1 LPT2 LPT3 LPT4 LPT5 LPT6 LPT7 LPT8 LPT9
) ;

# if $filename contains any invalid codepoints, we will get a warning like this
#
# Operation "pattern match (m//)" returns its argument for non-Unicode code point
#
# so silence it for now.

no warnings;

return "Portability Issue: '$1' is a reserved Windows device name"
if $filename =~ /^($badDosFilename)$/io ;

# Can't have the device name with an extension either
return "Portability Issue: '$1' is a reserved Windows device name"
if $filename =~ /^($badDosFilename)\./io ;
}

state $illegal_windows_chars = join '|', map { quotemeta } qw( < > : " | ? * );
return "Portability Issue: Windows filename cannot contain '$1'"
Expand Down Expand Up @@ -6957,7 +6968,8 @@ sub displayFileInfo
# TODO - check for badly formed content
if ($LanguageEncodingFlag && $opt_use_LanguageEncodingFlag)
{
eval { $name = Encode::decode('utf8', $name, Encode::FB_CROAK ) } ;
# use "utf-8-strict" to catch invalid codepoints
eval { $name = Encode::decode('utf-8-strict', $name, Encode::FB_CROAK ) } ;
::warning $FH->tell() - length $name, "Could not decode 'utf8' $type: " . cleanEval $@
if $@ ;
}
Expand Down
12 changes: 6 additions & 6 deletions t/002-main.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use Fcntl qw(SEEK_SET);

my $tests_per_zip = 6 ;
my $tests_per_zip_full = $tests_per_zip * 2 * 3 * 2 ;
plan tests => 214 * $tests_per_zip_full ;
plan tests => 215 * $tests_per_zip_full ;

sub run;
sub compareWithGolden;
Expand Down Expand Up @@ -148,8 +148,8 @@ for my $dir (sort keys %dirs)

my %controlData = parseControl($dir);

# default options assume
my $options = '--encoding utf8 --output-encoding utf8';
# default options assume utf8
my $options = '--encoding utf-8-strict --output-encoding utf8';

if (-e "$dir/options" )
{
Expand Down Expand Up @@ -793,7 +793,7 @@ sub zapGolden
my $locale_charset = getNativeLocale();
$_[0] =~ s<^(#\s*System Default Encoding:\s*)('.+?')><$1'$locale_charset'>mg ;

# Encode changed from using utf8 to UTF-8 at some point
my $UTF = getUTF8String();
$_[0] =~ s<\S+ (\S+) does not map to Unicode><$UTF $1 does not map to Unicode>g ;
# # Encode changed from using utf8 to UTF-8 at some point
# my $UTF = getUTF8String();
# $_[0] =~ s<\S+ (\S+) does not map to Unicode><$UTF $1 does not map to Unicode>g ;
}
4 changes: 2 additions & 2 deletions t/files/0000-errors/encoding/efs-and-bad-utf8/stdout
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
001A Filename Length 0003 (3)
001C Extra Length 0000 (0)
#
# WARNING: Offset 0x1E: Could not decode 'utf8' Filename: utf8 "\xE5" does not map to Unicode
# WARNING: Offset 0x1E: Could not decode 'utf8' Filename: UTF-8 "\xE5" does not map to Unicode
#
001E Filename 'aåa'
#
Expand Down Expand Up @@ -56,7 +56,7 @@
[Bits 28-31] 08 (8) 'Regular File'
0051 Local Header Offset 00000000 (0)
#
# WARNING: Offset 0x55: Could not decode 'utf8' Filename: utf8 "\xE5" does not map to Unicode
# WARNING: Offset 0x55: Could not decode 'utf8' Filename: UTF-8 "\xE5" does not map to Unicode
#
0055 Filename 'aåa'
#
Expand Down
4 changes: 2 additions & 2 deletions t/files/0000-errors/encoding/efs-and-bad-utf8/stdout-v
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
001A 001B 0002 03 00 Filename Length 0003 (3)
001C 001D 0002 00 00 Extra Length 0000 (0)
#
# WARNING: Offset 0x1E: Could not decode 'utf8' Filename: utf8 "\xE5" does not map to Unicode
# WARNING: Offset 0x1E: Could not decode 'utf8' Filename: UTF-8 "\xE5" does not map to Unicode
#
001E 0020 0003 61 E5 61 Filename 'aåa'
#
Expand Down Expand Up @@ -57,7 +57,7 @@
[Bits 28-31] 08 (8) 'Regular File'
0051 0054 0004 00 00 00 00 Local Header Offset 00000000 (0)
#
# WARNING: Offset 0x55: Could not decode 'utf8' Filename: utf8 "\xE5" does not map to Unicode
# WARNING: Offset 0x55: Could not decode 'utf8' Filename: UTF-8 "\xE5" does not map to Unicode
#
0055 0057 0003 61 E5 61 Filename 'aåa'
#
Expand Down
10 changes: 10 additions & 0 deletions t/files/0000-errors/encoding/invalid-codepoint/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#
# Check that a UTF-8 filename containing an invalid codepoint is detected.
# The zipdetails code traps this when it decodes the filename with "utf-8-strict",
# but not when it decodes with "utf8".

perl -MIO::Compress::Zip=:all -e 'zip \"abcd" => "test.zip", Minimal => 1, Stream => 0, efs =>1, Name => "\xFA\x80\xA0\x89\xB6" '

# codepoint 0x2020276 (beyond the Unicode maximum of 0x10FFFF) encoded in UTF-8 is
"\xFA\x80\xA0\x89\xB6"

60 changes: 60 additions & 0 deletions t/files/0000-errors/encoding/invalid-codepoint/stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

0000 LOCAL HEADER #1 04034B50 (67324752)
0004 Extract Zip Spec 14 (20) '2.0'
0005 Extract OS 00 (0) 'MS-DOS'
0006 General Purpose Flag 0800 (2048)
[Bits 1-2] 0 'Normal Compression'
[Bit 11] 1 'Language Encoding'
0008 Compression Method 0008 (8) 'Deflated'
000A Last Mod Date/Time 5879ADE9 (1484369385) 'Mon Mar 25 21:47:18 2024'
000E CRC ED82CD11 (3984772369)
0012 Compressed Size 00000006 (6)
0016 Uncompressed Size 00000004 (4)
001A Filename Length 0005 (5)
001C Extra Length 0000 (0)
#
# WARNING: Offset 0x1E: Could not decode 'utf8' Filename: UTF-8 "\xFA\x80\xA0\x89\xB6" does not map to Unicode
#
001E Filename 'ú€ ‰¶'
0023 PAYLOAD KLJN..

0029 CENTRAL HEADER #1 02014B50 (33639248)
002D Created Zip Spec 14 (20) '2.0'
002E Created OS 03 (3) 'Unix'
002F Extract Zip Spec 14 (20) '2.0'
0030 Extract OS 00 (0) 'MS-DOS'
0031 General Purpose Flag 0800 (2048)
[Bits 1-2] 0 'Normal Compression'
[Bit 11] 1 'Language Encoding'
0033 Compression Method 0008 (8) 'Deflated'
0035 Last Mod Date/Time 5879ADE9 (1484369385) 'Mon Mar 25 21:47:18 2024'
0039 CRC ED82CD11 (3984772369)
003D Compressed Size 00000006 (6)
0041 Uncompressed Size 00000004 (4)
0045 Filename Length 0005 (5)
0047 Extra Length 0000 (0)
0049 Comment Length 0000 (0)
004B Disk Start 0000 (0)
004D Int File Attributes 0000 (0)
[Bit 0] 0 'Binary Data'
004F Ext File Attributes 81A40000 (2175008768)
[Bits 16-24] 01A4 (420) 'Unix attrib: rw-r--r--'
[Bits 28-31] 08 (8) 'Regular File'
0053 Local Header Offset 00000000 (0)
#
# WARNING: Offset 0x57: Could not decode 'utf8' Filename: UTF-8 "\xFA\x80\xA0\x89\xB6" does not map to Unicode
#
0057 Filename 'ú€ ‰¶'

005C END CENTRAL HEADER 06054B50 (101010256)
0060 Number of this disk 0000 (0)
0062 Central Dir Disk no 0000 (0)
0064 Entries in this disk 0001 (1)
0066 Total Entries 0001 (1)
0068 Size of Central Dir 00000033 (51)
006C Offset to Central Dir 00000029 (41)
0070 Comment Length 0000 (0)
#
# Warning Count: 2
#
# Done
63 changes: 63 additions & 0 deletions t/files/0000-errors/encoding/invalid-codepoint/stdout-v
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@

0000 0003 0004 50 4B 03 04 LOCAL HEADER #1 04034B50 (67324752)
0004 0004 0001 14 Extract Zip Spec 14 (20) '2.0'
0005 0005 0001 00 Extract OS 00 (0) 'MS-DOS'
0006 0007 0002 00 08 General Purpose Flag 0800 (2048)
[Bits 1-2] 0 'Normal Compression'
[Bit 11] 1 'Language Encoding'
0008 0009 0002 08 00 Compression Method 0008 (8) 'Deflated'
000A 000D 0004 E9 AD 79 58 Last Mod Date/Time 5879ADE9 (1484369385) 'Mon Mar 25 21:47:18 2024'
000E 0011 0004 11 CD 82 ED CRC ED82CD11 (3984772369)
0012 0015 0004 06 00 00 00 Compressed Size 00000006 (6)
0016 0019 0004 04 00 00 00 Uncompressed Size 00000004 (4)
001A 001B 0002 05 00 Filename Length 0005 (5)
001C 001D 0002 00 00 Extra Length 0000 (0)
#
# WARNING: Offset 0x1E: Could not decode 'utf8' Filename: UTF-8 "\xFA\x80\xA0\x89\xB6" does not map to Unicode
#
001E 0022 0005 FA 80 A0 89 Filename 'ú€ ‰¶'
B6
0023 0028 0006 4B 4C 4A 4E PAYLOAD KLJN..
01 00

0029 002C 0004 50 4B 01 02 CENTRAL HEADER #1 02014B50 (33639248)
002D 002D 0001 14 Created Zip Spec 14 (20) '2.0'
002E 002E 0001 03 Created OS 03 (3) 'Unix'
002F 002F 0001 14 Extract Zip Spec 14 (20) '2.0'
0030 0030 0001 00 Extract OS 00 (0) 'MS-DOS'
0031 0032 0002 00 08 General Purpose Flag 0800 (2048)
[Bits 1-2] 0 'Normal Compression'
[Bit 11] 1 'Language Encoding'
0033 0034 0002 08 00 Compression Method 0008 (8) 'Deflated'
0035 0038 0004 E9 AD 79 58 Last Mod Date/Time 5879ADE9 (1484369385) 'Mon Mar 25 21:47:18 2024'
0039 003C 0004 11 CD 82 ED CRC ED82CD11 (3984772369)
003D 0040 0004 06 00 00 00 Compressed Size 00000006 (6)
0041 0044 0004 04 00 00 00 Uncompressed Size 00000004 (4)
0045 0046 0002 05 00 Filename Length 0005 (5)
0047 0048 0002 00 00 Extra Length 0000 (0)
0049 004A 0002 00 00 Comment Length 0000 (0)
004B 004C 0002 00 00 Disk Start 0000 (0)
004D 004E 0002 00 00 Int File Attributes 0000 (0)
[Bit 0] 0 'Binary Data'
004F 0052 0004 00 00 A4 81 Ext File Attributes 81A40000 (2175008768)
[Bits 16-24] 01A4 (420) 'Unix attrib: rw-r--r--'
[Bits 28-31] 08 (8) 'Regular File'
0053 0056 0004 00 00 00 00 Local Header Offset 00000000 (0)
#
# WARNING: Offset 0x57: Could not decode 'utf8' Filename: UTF-8 "\xFA\x80\xA0\x89\xB6" does not map to Unicode
#
0057 005B 0005 FA 80 A0 89 Filename 'ú€ ‰¶'
B6

005C 005F 0004 50 4B 05 06 END CENTRAL HEADER 06054B50 (101010256)
0060 0061 0002 00 00 Number of this disk 0000 (0)
0062 0063 0002 00 00 Central Dir Disk no 0000 (0)
0064 0065 0002 01 00 Entries in this disk 0001 (1)
0066 0067 0002 01 00 Total Entries 0001 (1)
0068 006B 0004 33 00 00 00 Size of Central Dir 00000033 (51)
006C 006F 0004 29 00 00 00 Offset to Central Dir 00000029 (41)
0070 0071 0002 00 00 Comment Length 0000 (0)
#
# Warning Count: 2
#
# Done
Binary file not shown.

0 comments on commit f52cb23

Please sign in to comment.