// Offsets and sizes, in bytes, of the parts of a tar header record that the
// Tar matcher inspects.
const (
	tarRecordSize  = 512 // size of one tar record; the header fills the first one
	tarNameSize    = 100 // length of the leading Name field
	tarChksumStart = 148 // first byte of the Chksum field
	tarChksumEnd   = 156 // one past the last byte of the Chksum field
)

// Tar matches a (t)ape (ar)chive file.
//
// Tar files are divided into 512-byte records and the first record holds the
// header of the first entry. Every header stores a checksum of itself, so
// instead of relying on the optional "ustar" magic the matcher recomputes the
// checksum of the first record and compares it with the recorded one. This
// works for both ustar and the older, magic-less v7 format.
//
// Inputs shorter than one record, inputs whose name field looks like a Gentoo
// GLEP binary package, and inputs whose checksum field is empty or is not a
// valid octal number are reported as not tar.
func Tar(raw []byte, _ uint32) bool {
	// The structure of a tar header:
	// type TarHeader struct {
	// 	Name     [100]byte
	// 	Mode     [8]byte
	// 	Uid      [8]byte
	// 	Gid      [8]byte
	// 	Size     [12]byte
	// 	Mtime    [12]byte
	// 	Chksum   [8]byte
	// 	Linkflag byte
	// 	Linkname [100]byte
	// 	Magic    [8]byte
	// 	Uname    [32]byte
	// 	Gname    [32]byte
	// 	Devmajor [8]byte
	// 	Devminor [8]byte
	// }

	if len(raw) < tarRecordSize {
		return false
	}
	raw = raw[:tarRecordSize]

	// The first 100 bytes of the header are the file name. A name ending in
	// "/gpkg-1" marks a Gentoo GLEP binary package, which is deliberately not
	// reported as plain tar (presumably so a more specific matcher can claim
	// it; confirm against the detection tree).
	if bytes.Contains(raw[:tarNameSize], []byte("/gpkg-1\x00")) {
		return false
	}

	// Get the checksum recorded in the header. An empty field is an error:
	// treating it as 0 would let junk whose signed byte sum happens to be 0
	// pass as a tar archive.
	recsum, err := tarParseOctal(raw[tarChksumStart:tarChksumEnd])
	if err != nil {
		return false
	}
	unsigned, signed := tarChksum(raw)
	return recsum == unsigned || recsum == signed
}

// tarParseOctal parses the octal number stored in the tar header field b.
//
// Unused fields are filled with NULs and used ones may be padded with spaces
// or NULs on either side, so leading and trailing NULs and spaces are dropped
// first; parsing then stops at the first NUL that remains. An error is
// returned when nothing is left after trimming, when a non-octal character is
// met before the terminating NUL, or when the value does not fit in an int64.
func tarParseOctal(b []byte) (int64, error) {
	b = bytes.Trim(b, " \x00")

	// No special case for an empty field: ParseUint reports a syntax error
	// for "". bitSize 63 guarantees the conversion to int64 cannot wrap.
	x, err := strconv.ParseUint(tarParseString(b), 8, 63)
	if err != nil {
		return 0, err
	}
	return int64(x), nil
}

// tarParseString converts a NUL-terminated byte slice to a string. Bytes from
// the first NUL on are discarded; without a NUL the whole slice is used.
func tarParseString(b []byte) string {
	if i := bytes.IndexByte(b, 0); i >= 0 {
		return string(b[:i])
	}
	return string(b)
}

// tarChksum computes the checksum of the header block b.
//
// The checksum is stored inside the very block it covers, so while summing,
// the bytes reserved for it are counted as ASCII spaces (0x20), the
// placeholder value they hold before the checksum is written.
// POSIX specifies a sum of the unsigned byte values, but the Sun tar used
// signed byte values. Both are computed and returned.
func tarChksum(b []byte) (unsigned, signed int64) {
	for i, c := range b {
		if tarChksumStart <= i && i < tarChksumEnd {
			c = ' ' // Treat the checksum field itself as all spaces.
		}
		unsigned += int64(c)
		signed += int64(int8(c))
	}
	return unsigned, signed
}

// TestTarParseOctal checks tarParseOctal against padded, truncated, malformed
// and empty header fields.
func TestTarParseOctal(t *testing.T) {
	tests := []struct {
		in   string
		want int64
		ok   bool
	}{
		{"0000000\x00", 0, true},
		{" \x0000000\x00", 0, true},
		{" \x0000003\x00", 3, true},
		{"00000000227\x00", 0227, true},
		{"032033\x00 ", 032033, true},
		{"320330\x00 ", 0320330, true},
		{"0000660\x00 ", 0660, true},
		{"\x00 0000660\x00 ", 0660, true},
		{"0123456789abcdef", 0, false},
		{"0123456789\x00abcdef", 0, false},
		{"01234567\x0089abcdef", 342391, true},
		{"0123\x7e\x5f\x264123", 0, false},
		// A field holding only padding carries no number at all.
		{"", 0, false},
		{"\x00\x00\x00\x00\x00\x00\x00\x00", 0, false},
		{" \x00 \x00", 0, false},
	}

	for _, tt := range tests {
		got, err := tarParseOctal([]byte(tt.in))
		ok := err == nil
		if ok != tt.ok {
			if tt.ok {
				t.Errorf("tarParseOctal(%q): got parsing failure, want success", tt.in)
			} else {
				t.Errorf("tarParseOctal(%q): got parsing success, want failure", tt.in)
			}
		}
		if got != tt.want {
			t.Errorf("tarParseOctal(%q): got %d, want %d", tt.in, got, tt.want)
		}
	}
}