Skip to content

Commit

Permalink
Change tar detection to use checksum instead of legal ranges of values (#466)
Browse files Browse the repository at this point in the history

Previous detection used the rules from PRONOM. This commit replaces
those rules with the check from github.com/file/file: compute checksum
for header and check if recorded checksum matches.
Fixes #464
  • Loading branch information
gabriel-vasile authored Jan 3, 2024
1 parent 02af149 commit 4ea95cd
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 39 deletions.
109 changes: 73 additions & 36 deletions internal/magic/archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package magic
import (
"bytes"
"encoding/binary"
"strconv"
)

var (
Expand Down Expand Up @@ -74,51 +75,87 @@ func CRX(raw []byte, limit uint32) bool {
}

// Tar matches a (t)ape (ar)chive file.
// Tar files are divided into 512 bytes records. First record contains a 257
// bytes header padded with NUL.
func Tar(raw []byte, _ uint32) bool {
// The "magic" header field for files in UStar (POSIX IEEE P1003.1) archives
// has the prefix "ustar". The values of the remaining bytes in this field vary
// by archiver implementation.
if len(raw) >= 512 && bytes.HasPrefix(raw[257:], []byte{0x75, 0x73, 0x74, 0x61, 0x72}) {
return true
const sizeRecord = 512

// The structure of a tar header:
// type TarHeader struct {
// Name [100]byte
// Mode [8]byte
// Uid [8]byte
// Gid [8]byte
// Size [12]byte
// Mtime [12]byte
// Chksum [8]byte
// Linkflag byte
// Linkname [100]byte
// Magic [8]byte
// Uname [32]byte
// Gname [32]byte
// Devmajor [8]byte
// Devminor [8]byte
// }

if len(raw) < sizeRecord {
return false
}
raw = raw[:sizeRecord]

if len(raw) < 256 {
// First 100 bytes of the header represent the file name.
// Check if file looks like Gentoo GLEP binary package.
if bytes.Contains(raw[:100], []byte("/gpkg-1\x00")) {
return false
}

// The older v7 format has no "magic" field, and therefore must be identified
// with heuristics based on legal ranges of values for other header fields:
// https://www.nationalarchives.gov.uk/PRONOM/Format/proFormatSearch.aspx?status=detailReport&id=385&strPageToDisplay=signatures
rules := []struct {
min, max uint8
i int
}{
{0x21, 0xEF, 0},
{0x30, 0x37, 105},
{0x20, 0x37, 106},
{0x00, 0x00, 107},
{0x30, 0x37, 113},
{0x20, 0x37, 114},
{0x00, 0x00, 115},
{0x30, 0x37, 121},
{0x20, 0x37, 122},
{0x00, 0x00, 123},
{0x30, 0x37, 134},
{0x30, 0x37, 146},
{0x30, 0x37, 153},
{0x00, 0x37, 154},
// Get the checksum recorded into the file.
recsum, err := tarParseOctal(raw[148:156])
if err != nil {
return false
}
for _, r := range rules {
if raw[r.i] < r.min || raw[r.i] > r.max {
return false
}
sum1, sum2 := tarChksum(raw)
return recsum == sum1 || recsum == sum2
}

// tarParseOctal parses b as an octal number stored in a tar header field and
// returns its value.
//
// Leading and trailing spaces and NULs are stripped first; a field that is
// empty after stripping yields 0 with a nil error. Anything after the first
// remaining NUL is ignored. A non-nil error is returned (with value 0) when
// the remaining text is not a valid octal number or when it does not fit in a
// non-negative int64.
func tarParseOctal(b []byte) (int64, error) {
	// Unused fields are filled with NULs, and used fields may be padded with
	// spaces or NULs on either side, so strip both before parsing.
	b = bytes.Trim(b, " \x00")
	if len(b) == 0 {
		return 0, nil
	}

	// Parse with a bit size of 63 so that values >= 2^63 are reported as range
	// errors instead of silently wrapping to a negative int64 in the
	// conversion below.
	x, err := strconv.ParseUint(tarParseString(b), 8, 63)
	if err != nil {
		return 0, err
	}
	return int64(x), nil
}

for _, i := range []uint8{135, 147, 155} {
if raw[i] != 0x00 && raw[i] != 0x20 {
return false
}
// tarParseString returns the bytes of b that precede the first NUL as a
// string. When b contains no NUL, the whole slice is converted.
func tarParseString(b []byte) string {
	end := bytes.IndexByte(b, 0)
	if end < 0 {
		// No terminator: the field uses its full width.
		end = len(b)
	}
	return string(b[:end])
}

return true
// tarChksum sums every byte of the header block b and returns the total
// twice: once treating bytes as unsigned values and once as signed values.
// POSIX specifies the unsigned sum, but the Sun tar used signed bytes, so both
// are returned for the caller to compare against.
//
// The checksum is stored inside the very block it covers, so the bytes at
// offsets 148 through 155 are counted as ASCII spaces (0x20), the placeholder
// value they hold while the checksum is being calculated.
func tarChksum(b []byte) (unsigned, signed int64) {
	const (
		chksumStart = 148 // first byte of the checksum field
		chksumEnd   = 156 // one past the last byte of the checksum field
	)
	for i := range b {
		v := b[i]
		if i >= chksumStart && i < chksumEnd {
			v = ' '
		}
		unsigned += int64(v)
		signed += int64(int8(v))
	}
	return unsigned, signed
}
39 changes: 39 additions & 0 deletions internal/magic/archive_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package magic

import "testing"

// TestTarParseOctal runs tarParseOctal over a table of header field values:
// fields padded with spaces and NULs, fields with text after an embedded NUL,
// and fields containing characters that are not octal digits. For each input
// it checks both whether parsing succeeded and the value that came back.
func TestTarParseOctal(t *testing.T) {
	type testCase struct {
		in   string
		want int64
		ok   bool
	}
	cases := []testCase{
		{"0000000\x00", 0, true},
		{" \x0000000\x00", 0, true},
		{" \x0000003\x00", 3, true},
		{"00000000227\x00", 0227, true},
		{"032033\x00 ", 032033, true},
		{"320330\x00 ", 0320330, true},
		{"0000660\x00 ", 0660, true},
		{"\x00 0000660\x00 ", 0660, true},
		{"0123456789abcdef", 0, false},
		{"0123456789\x00abcdef", 0, false},
		{"01234567\x0089abcdef", 342391, true},
		{"0123\x7e\x5f\x264123", 0, false},
	}

	for _, tc := range cases {
		got, err := tarParseOctal([]byte(tc.in))

		// Report a mismatch between the expected and actual parse outcome.
		switch parsed := err == nil; {
		case tc.ok && !parsed:
			t.Errorf("parseOctal(%q): got parsing failure, want success", tc.in)
		case !tc.ok && parsed:
			t.Errorf("parseOctal(%q): got parsing success, want failure", tc.in)
		}

		// The value is checked in every case, including expected failures.
		if got != tc.want {
			t.Errorf("parseOctal(%q): got %d, want %d", tc.in, got, tc.want)
		}
	}
}
7 changes: 4 additions & 3 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,10 @@ var files = map[string]string{
"tar.oldgnu.tar": "application/x-tar",
"tar.posix.tar": "application/x-tar",
// tar.star.tar was generated with star 1.6.
"tar.star.tar": "application/x-tar",
"tar.ustar.tar": "application/x-tar",
"tar.v7.tar": "application/x-tar",
"tar.star.tar": "application/x-tar",
"tar.ustar.tar": "application/x-tar",
"tar.v7.tar": "application/x-tar",
"tar.issue464.tar": "application/x-tar",
// tar.v7-gnu.tar is a v7 tar archive generated with GNU tar 1.29.
"tar.v7-gnu.tar": "application/x-tar",
"tcl.tcl": "text/x-tcl",
Expand Down
Binary file added testdata/tar.issue464.tar
Binary file not shown.

0 comments on commit 4ea95cd

Please sign in to comment.