Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Canonical types and aliases #101

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 56 additions & 5 deletions lib/marcel/magic.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,58 @@ def initialize(type)
# Option keys:
# * <i>:extensions</i>: String list or single string of file extensions
# * <i>:parents</i>: String list or single string of parent mime types
# * <i>:aliases</i>: String list or single string of aliased mime types
# * <i>:magic</i>: Mime magic specification
# * <i>:comment</i>: Comment string
def self.add(type, options)
  # Each option may be nil, a single string, or a (possibly nested) list.
  extensions = [options[:extensions]].flatten.compact
  extensions.each { |ext| EXTENSIONS[ext] = type }
  TYPE_EXTS[type] = extensions

  # A type that is being registered must not itself remain listed as an alias.
  TYPE_ALIASES.delete(type)
  [options[:aliases]].flatten.compact.each do |aliased|
    TYPE_ALIASES[aliased] = type
  end

  # Parents already recorded for the type are kept when none are given.
  parents = [options[:parents]].flatten.compact
  TYPE_PARENTS[type] = parents unless parents.empty?

  # New matchers are placed at the front of MAGIC.
  MAGIC.unshift [type, options[:magic]] if options[:magic]
end

# Removes a mime type from the dictionary. You might want to do this if
# Override the canonical MIME type with an alias or subtype.
#
# +type+ is first removed from the dictionary, then the extensions, aliases,
# parents and magic matchers recorded for +instead_of+ are reassigned to
# +type+, and +instead_of+ becomes an alias of +type+.
#
# * <i>type</i>: The mime type to make canonical.
# * <i>instead_of</i>: The currently canonical mime type to demote to an alias.
#
# Raises ArgumentError if +instead_of+ is itself registered as an alias.
def self.canonicalize(type, instead_of:)
  raise ArgumentError, "#{instead_of} is an alias, not canonical" if TYPE_ALIASES[instead_of]

  # Nothing to swap; removing the type below would only discard its data
  # and leave it aliased to itself.
  return type if type == instead_of

  # Remove the alias or subtype first
  remove(type)

  # Replace the old canonical
  EXTENSIONS.select { |_, t| t == instead_of }.each_key do |ext|
    EXTENSIONS[ext] = type
  end

  # Move the per-type extension list too, so it stays in step with EXTENSIONS.
  old_extensions = TYPE_EXTS.delete(instead_of)
  TYPE_EXTS[type] = old_extensions if old_extensions

  TYPE_ALIASES.select { |_, t| t == instead_of }.each_key do |aliased|
    TYPE_ALIASES[aliased] = type
  end

  # Only record parents when the old canonical had some; never store nil.
  old_parents = TYPE_PARENTS.delete(instead_of)
  TYPE_PARENTS[type] = old_parents if old_parents

  MAGIC.select { |t, _| t == instead_of }.each { |pair| pair[0] = type }

  # Alias the old canonical
  TYPE_ALIASES[instead_of] = type
end

# Removes a mime type from the dictionary. You might want to do this if
# you're seeing impossible conflicts (for instance, application/x-gmc-link).
# * <i>type</i>: The mime type to remove. All associated extensions, magic,
#   parents and aliases are removed too.
def self.remove(type)
  EXTENSIONS.delete_if { |_ext, t| t == type }
  MAGIC.delete_if { |t, _m| t == type }
  TYPE_EXTS.delete(type)
  TYPE_PARENTS.delete(type)
  # Drop the type both where it is an alias and where it is the alias target.
  TYPE_ALIASES.delete_if { |aliased, canonical| aliased == type || canonical == type }
end

# Returns true if type is a text format
Expand All @@ -64,11 +97,24 @@ def extensions
TYPE_EXTS[type] || []
end

# Returns the canonical form of this type: a new instance built from the
# TYPE_ALIASES target when the type is an alias, otherwise the receiver.
def canonical
  target = TYPE_ALIASES[type]
  return self unless target

  self.class.new(target)
end

# Get mime comment. Comments are deprecated, so this always returns nil.
def comment
  # deprecated
  nil
end

# Lookup canonical mime type by mime type string.
# Returns nil when no type is given; otherwise the type string is downcased,
# wrapped in a new instance and resolved through #canonical.
# NOTE(review): because the string is downcased first, an alias key that
# contains uppercase characters (TYPE_ALIASES lists
# 'application/x-Gnumeric-spreadsheet') presumably can never match - confirm
# whether the table should be normalised.
def self.by_type(type)
  return unless type

  new(type.downcase).canonical
end

# Lookup mime type by file extension
def self.by_extension(ext)
ext = ext.to_s.downcase
Expand Down Expand Up @@ -111,9 +157,14 @@ def hash
alias == eql?

# Returns true when +child+ and +parent+ resolve to the same canonical type,
# otherwise whether any parent recorded for +child+ in TYPE_PARENTS is
# (recursively) a child of +parent+. The result is nil when +child+ has no
# recorded parents.
def self.child?(child, parent)
  child = canonical(child)
  parent = canonical(parent)
  return true if child == parent

  TYPE_PARENTS[child]&.any? { |ancestor| child?(ancestor, parent) }
end

# Resolves +aliased_type+ through by_type and returns the resulting type
# string, or nil when by_type returns nil.
def self.canonical(aliased_type)
  resolved = by_type(aliased_type)
  resolved&.type
end

def self.magic_match(io, method)
return magic_match(StringIO.new(io.to_s), method) unless io.respond_to?(:read)

Expand Down
38 changes: 32 additions & 6 deletions lib/marcel/mime_type.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,37 @@ class MimeType
BINARY = "application/octet-stream"

class << self
def extend(type, extensions: [], parents: [], magic: nil)
extensions = (Array(extensions) + Array(Marcel::TYPE_EXTS[type])).uniq
parents = (Array(parents) + Array(Marcel::TYPE_PARENTS[type])).uniq
Magic.add(type, extensions: extensions, magic: magic, parents: parents)
# Makes +type+ canonical in place of +instead_of+ by delegating to
# Magic.canonicalize with the same arguments.
def canonicalize(type, instead_of:)
  Magic.canonicalize(type, instead_of: instead_of)
end

# Extends +type+ with additional extensions, aliases, parents and magic
# matchers, then registers the result through Magic.add.
#
# Extensions, aliases and parents are merged with the values already recorded
# for +type+. A warning is emitted when an argument only repeats what is
# already recorded.
#
# * <i>type</i>: The mime type to extend.
# * <i>extensions</i>: String list or single string of file extensions.
# * <i>aliases</i>: String list or single string of aliased mime types.
# * <i>parents</i>: String list or single string of parent mime types.
# * <i>magic</i>: Mime magic specification.
#
# Deliberately verbose: this keeps the public API footprint small and leaves
# the generated data tables untouched.
def extend(type, extensions: nil, aliases: nil, parents: nil, magic: nil)
  extensions = Array(extensions)
  if extensions.any? && extensions.sort == Array(Marcel::TYPE_EXTS[type]).sort
    warn "#{type} already has extensions #{extensions.inspect}"
  end
  extensions |= Array(Marcel::TYPE_EXTS[type])

  aliases = Array(aliases)
  existing_aliases = Marcel::TYPE_ALIASES.select { |_, canonical| canonical == type }.keys
  if aliases.any? && aliases.sort == existing_aliases.sort
    warn "#{type} already has aliases #{aliases.inspect}"
  end
  aliases |= existing_aliases

  parents = Array(parents)
  if parents.any? && parents.sort == Array(Marcel::TYPE_PARENTS[type]).sort
    warn "#{type} already has parents #{parents.inspect}"
  end
  parents |= Array(Marcel::TYPE_PARENTS[type])

  magic = Array(magic)
  # The block parameter must not be named +type+: that would shadow the
  # argument and select every MAGIC entry instead of this type's entries.
  existing_magic = Marcel::MAGIC.select { |magic_type, _| magic_type == type }.map(&:last)
  # existing_magic is a list of specifications, so test membership rather
  # than comparing one specification against the whole list.
  if magic.any? && existing_magic.include?(magic)
    warn "#{type} already has magic matchers #{magic.inspect}"
  end

  # An empty array is truthy, so pass nil when there is no magic; otherwise
  # Magic.add would register an empty matcher entry for the type.
  Magic.add type, extensions: extensions, magic: (magic unless magic.empty?), aliases: aliases, parents: parents
end

# Returns the most appropriate content type for the given file.
Expand All @@ -32,7 +59,6 @@ def for(pathname_or_io = nil, name: nil, extension: nil, declared_type: nil)
end

private

def for_data(pathname_or_io)
if pathname_or_io
with_io(pathname_or_io) do |io|
Expand Down Expand Up @@ -60,7 +86,7 @@ def for_extension(extension)
end

def for_declared_type(declared_type)
type = parse_media_type(declared_type)
type = Marcel::Magic.canonical(parse_media_type(declared_type))

# application/octet-stream is treated as an undeclared/missing type,
# allowing the type to be inferred from the filename. If there's no
Expand Down
30 changes: 15 additions & 15 deletions lib/marcel/mime_type/definitions.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,30 @@
Marcel::MimeType.extend "application/vnd.ms-powerpoint.template.macroenabled.12", parents: "application/vnd.openxmlformats-officedocument.presentationml.presentation"
Marcel::MimeType.extend "application/vnd.ms-powerpoint.slideshow.macroenabled.12", parents: "application/vnd.openxmlformats-officedocument.presentationml.presentation"

Marcel::MimeType.extend "application/vnd.apple.pages", extensions: %w( pages ), parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.numbers", extensions: %w( numbers ), parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.keynote", extensions: %w( key ), parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.pages", parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.numbers", parents: "application/zip"
Marcel::MimeType.extend "application/vnd.apple.keynote", parents: "application/zip"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Including the file extensions suggests they're being extended, but they aren't. Omit them to clarify that we're changing the parent.


Marcel::MimeType.extend "audio/aac", extensions: %w( aac ), parents: "audio/x-aac"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This style of "overriding" a MIME type results in .aac files resolving to audio/aac but leaves existing magic bytes matches resolving to audio/x-aac.

Replaced with canonicalize "audio/aac", instead_of: "audio/x-aac".

Marcel::MimeType.extend("audio/ogg", extensions: %w( ogg oga ), magic: [[0, 'OggS', [[29, 'vorbis']]]])
# Upstream aliases to application/x-x509-cert. Override with a ;format=pem subtype.
Marcel::MimeType.extend "application/x-x509-ca-cert", magic: [[0, '-----BEGIN CERTIFICATE-----']], extensions: %w( pem ), parents: "application/x-x509-cert;format=pem"

Marcel::MimeType.extend "image/vnd.dwg", magic: [[0, "AC10"]]
Marcel::MimeType.extend "audio/mpc", magic: [[0, "MPCKSH"]], extensions: %w( mpc )
Marcel::MimeType.extend "audio/ogg", extensions: %w( ogg oga ), magic: [[0, 'OggS', [[29, 'vorbis']]]]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really odd one, breaking the MIME hierarchy entirely and using the same magic matcher as audio/vorbis. Leaving this for later.

Marcel::MimeType.canonicalize "audio/aac", instead_of: "audio/x-aac"
Marcel::MimeType.canonicalize "audio/flac", instead_of: "audio/x-flac"
Marcel::MimeType.canonicalize "audio/x-wav", instead_of: "audio/vnd.wave"
Copy link
Member Author

@jeremy jeremy Mar 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, change the canonical type rather than introducing misleading MIME subtypes.


Marcel::MimeType.extend "application/x-x509-ca-cert", magic: [[0, '-----BEGIN CERTIFICATE-----']], extensions: %w( pem ), parents: "application/x-x509-cert;format=pem"
Marcel::MimeType.extend "image/vnd.dwg", magic: [[0, "AC10"]]

Marcel::MimeType.extend "image/avif", magic: [[4, "ftypavif"]], extensions: %w( avif )
Marcel::MimeType.extend "image/heif", magic: [[4, "ftypmif1"]], extensions: %w( heif )
Marcel::MimeType.extend "image/heic", magic: [[4, "ftypheic"]], extensions: %w( heic )
Marcel::MimeType.extend "image/avif", magic: [[4, "ftypavif"]]
Marcel::MimeType.extend "image/heif", magic: [[4, "ftypmif1"]]
Marcel::MimeType.extend "image/heic", magic: [[4, "ftypheic"]]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These matchers are already in the Tika data. Unclear whether these are simply defunct and can be removed.


Marcel::MimeType.extend "image/x-raw-sony", extensions: %w( arw ), parents: "image/tiff"
Marcel::MimeType.extend "image/x-raw-canon", extensions: %w( cr2 crw ), parents: "image/tiff"
Marcel::MimeType.extend "image/x-raw-canon", parents: "image/tiff"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Drop duplicate extensions.


Marcel::MimeType.extend "video/mp4", magic: [[4, "ftypisom"], [4, "ftypM4V "]], extensions: %w( mp4 m4v )

Marcel::MimeType.extend "audio/flac", magic: [[0, 'fLaC']], extensions: %w( flac ), parents: "audio/x-flac"
Marcel::MimeType.extend "audio/x-wav", magic: [[0, 'RIFF', [[8, 'WAVE']]]], extensions: %w( wav ), parents: "audio/vnd.wav"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switched to canonicalize.

Marcel::MimeType.extend "audio/mpc", magic: [[0, "MPCKSH"]], extensions: %w( mpc )

Marcel::MimeType.extend "font/ttf", magic: [[0, "\x00\x01\x00\x00"]], extensions: %w( ttf ttc )
Marcel::MimeType.extend "font/otf", magic: [[0, "OTTO"]], extensions: %w( otf ), parents: "font/ttf"
Marcel::MimeType.extend "application/vnd.adobe.flash.movie", magic: [[0, "FWS"], [0, "CWS"]], extensions: %w( swf )
Expand Down
143 changes: 143 additions & 0 deletions lib/marcel/tables.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2148,6 +2148,149 @@ module Marcel
'video/x-sgi-movie' => %w(movie),
'x-conference/x-cooltalk' => %w(ice), # Cooltalk Audio
}
TYPE_ALIASES = {
'application/bat' => 'application/x-bat',
'application/x-coreldraw' => 'application/coreldraw',
'application/x-cdr' => 'application/coreldraw',
'application/cdr' => 'application/coreldraw',
'image/x-cdr' => 'application/coreldraw',
'image/cdr' => 'application/coreldraw',
'application/x-setupscript' => 'application/inf',
'application/x-wine-extension-inf' => 'application/inf',
'application/x-javascript' => 'application/javascript',
'text/javascript' => 'application/javascript',
'application/x-java-vm' => 'application/java-vm',
'application/x-java' => 'application/java-vm',
'application/mac-binhex' => 'application/mac-binhex40',
'application/binhex' => 'application/mac-binhex40',
'application/vnd.ms-word' => 'application/msword',
'application/x-ogg' => 'audio/vorbis',
'application/msonenote' => 'application/onenote',
'application/x-pdf' => 'application/pdf',
'application/pgp' => 'application/pgp-encrypted',
'text/rss' => 'application/rss+xml',
'text/rtf' => 'application/rtf',
'application/smil' => 'application/smil+xml',
'application/x-kchart' => 'application/vnd.kde.kchart',
'application/x-kpresenter' => 'application/vnd.kde.kpresenter',
'application/x-kspread' => 'application/vnd.kde.kspread',
'application/x-kword' => 'application/vnd.kde.kword',
'application/x-koan' => 'application/vnd.koan',
'application/x-123' => 'application/vnd.lotus-1-2-3',
'application/x-mif' => 'application/vnd.mif',
'application/x-frame' => 'application/vnd.mif',
'application/msexcel' => 'application/vnd.ms-excel',
'application/mspowerpoint' => 'application/vnd.ms-powerpoint',
'application/ms-tnef' => 'application/vnd.ms-tnef',
'application/oxps' => 'application/vnd.ms-xpsdocument',
'application/x-vnd.oasis.opendocument.chart' => 'application/vnd.oasis.opendocument.chart',
'application/x-vnd.oasis.opendocument.chart-template' => 'application/vnd.oasis.opendocument.chart-template',
'application/vnd.oasis.opendocument.database' => 'application/vnd.oasis.opendocument.base',
'application/x-vnd.oasis.opendocument.formula' => 'application/vnd.oasis.opendocument.formula',
'application/x-vnd.oasis.opendocument.formula-template' => 'application/vnd.oasis.opendocument.formula-template',
'application/x-vnd.oasis.opendocument.graphics' => 'application/vnd.oasis.opendocument.graphics',
'application/x-vnd.oasis.opendocument.graphics-template' => 'application/vnd.oasis.opendocument.graphics-template',
'application/x-vnd.oasis.opendocument.image' => 'application/vnd.oasis.opendocument.image',
'application/x-vnd.oasis.opendocument.image-template' => 'application/vnd.oasis.opendocument.image-template',
'application/x-vnd.oasis.opendocument.presentation' => 'application/vnd.oasis.opendocument.presentation',
'application/x-vnd.oasis.opendocument.presentation-template' => 'application/vnd.oasis.opendocument.presentation-template',
'application/x-vnd.oasis.opendocument.spreadsheet' => 'application/vnd.oasis.opendocument.spreadsheet',
'application/x-vnd.oasis.opendocument.spreadsheet-template' => 'application/vnd.oasis.opendocument.spreadsheet-template',
'application/x-vnd.oasis.opendocument.text' => 'application/vnd.oasis.opendocument.text',
'application/x-vnd.oasis.opendocument.text-master' => 'application/vnd.oasis.opendocument.text-master',
'application/x-vnd.oasis.opendocument.text-template' => 'application/vnd.oasis.opendocument.text-template',
'application/x-vnd.oasis.opendocument.text-web' => 'application/vnd.oasis.opendocument.text-web',
'application/x-vnd.sun.xml.writer' => 'application/vnd.sun.xml.writer',
'application/vnd.ms-visio' => 'application/vnd.visio',
'image/x-targa' => 'image/x-tga',
'application/x-unix-archive' => 'application/x-archive',
'application/x-arj-compressed' => 'application/x-arj',
'application/x-dbm' => 'application/x-berkeley-db',
'application/vnd.debian.binary-package' => 'application/x-debian-package',
'application/x-Gnumeric-spreadsheet' => 'application/x-gnumeric',
'application/x-gzip' => 'application/gzip',
'application/x-gunzip' => 'application/gzip',
'application/gzipped' => 'application/gzip',
'application/gzip-compressed' => 'application/gzip',
'application/x-gzip-compressed' => 'application/gzip',
'gzip/document' => 'application/gzip',
'application/x-windows-installer' => 'application/x-ms-installer',
'application/x-msi' => 'application/x-ms-installer',
'application/x-rar' => 'application/x-rar-compressed',
'text/x-tex' => 'application/x-tex',
'text/x-texinfo' => 'application/x-texinfo',
'application/x-x509-ca-cert' => 'application/x-x509-cert',
'application/x-x509-user-cert' => 'application/x-x509-cert',
'text/xml' => 'application/xml',
'application/x-xml' => 'application/xml',
'text/x-dtd' => 'application/xml-dtd',
'text/xml-external-parsed-entity' => 'application/xml-external-parsed-entity',
'text/xsl' => 'application/xslt+xml',
'application/x-zip-compressed' => 'application/zip',
'application/x-deflate' => 'application/zlib',
'audio/x-m4a' => 'audio/mp4',
'audio/x-mp4a' => 'audio/mp4',
'audio/x-mpeg' => 'audio/mpeg',
'audio/x-ogg-flac' => 'audio/x-oggflac',
'audio/x-ogg-pcm' => 'audio/x-oggpcm',
'application/x-speex' => 'audio/speex',
'audio/aiff' => 'audio/x-aiff',
'audio/x-realaudio' => 'audio/x-pn-realaudio',
'audio/x-wav' => 'audio/vnd.wave',
'audio/wave' => 'audio/vnd.wave',
'audio/wav' => 'audio/vnd.wave',
'image/x-bmp' => 'image/bmp',
'image/x-ms-bmp' => 'image/bmp',
'image/x-emf' => 'image/emf',
'application/x-emf' => 'image/emf',
'application/x-ms-emz' => 'image/x-emf-compressed',
'image/hevc' => 'image/heic',
'image/hevc-sequence' => 'image/heic-sequence',
'video/jpm' => 'image/jpm',
'image/ntf' => 'image/nitf',
'image/x-psd' => 'image/vnd.adobe.photoshop',
'application/photoshop' => 'image/vnd.adobe.photoshop',
'image/x-dwg' => 'image/vnd.dwg',
'application/acad' => 'image/vnd.dwg',
'application/x-acad' => 'image/vnd.dwg',
'application/autocad_dwg' => 'image/vnd.dwg',
'application/dwg' => 'image/vnd.dwg',
'application/x-dwg' => 'image/vnd.dwg',
'application/x-autocad' => 'image/vnd.dwg',
'drawing/dwg' => 'image/vnd.dwg',
'image/x-icon' => 'image/vnd.microsoft.icon',
'image/x-dcx' => 'image/vnd.zbrush.dcx',
'image/x-pcx' => 'image/vnd.zbrush.pcx',
'image/x-pc-paintbrush' => 'image/vnd.zbrush.pcx',
'image/x-wmf' => 'image/wmf',
'application/x-msmetafile' => 'image/wmf',
'image/x-jb2' => 'image/x-jbig2',
'image/xcf' => 'image/x-xcf',
'application/x-mimearchive' => 'multipart/related',
'message/rfc2557' => 'multipart/related',
'drawing/x-dwf' => 'model/vnd.dwf',
'text/x-asm' => 'text/x-assembly',
'application/x-troff' => 'text/troff',
'application/x-troff-man' => 'text/troff',
'application/x-troff-me' => 'text/troff',
'application/x-troff-ms' => 'text/troff',
'text/x-c' => 'text/x-csrc',
'text/x-java' => 'text/x-java-source',
'text/x-properties' => 'text/x-java-properties',
'text/properties' => 'text/x-java-properties',
'application/x-httpd-jsp' => 'text/x-jsp',
'application/matlab-mat' => 'application/x-matlab-data',
'application/x-tcl' => 'text/x-tcl',
'video/x-daala' => 'video/daala',
'video/x-theora' => 'video/theora',
'video/x-ogg-uvs' => 'video/x-ogguvs',
'video/x-ogg-yuv' => 'video/x-oggyuv',
'video/x-ogg-rgb' => 'video/x-oggrgb',
'video/avi' => 'video/x-msvideo',
'video/msvideo' => 'video/x-msvideo',
'application/font-woff' => 'font/woff',
'application/font-woff2' => 'font/woff2',
}
TYPE_PARENTS = {
'application/bizagi-modeler' => %w(application/zip),
'application/dash+xml' => %w(application/xml),
Expand Down
7 changes: 7 additions & 0 deletions script/generate_tables.rb
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def get_matches(mime, parent)

extensions = {}
types = {}
aliases = {}
magics = []

ARGV.each do |path|
Expand All @@ -137,6 +138,7 @@ def get_matches(mime, parent)
(doc/'mime-info/mime-type').each do |mime|
comments = Hash[*(mime/'_comment').map {|comment| [comment['xml:lang'], comment.inner_text] }.flatten]
type = mime['type']
(mime/'alias').each { |x| aliases[x['type']] = type }
subclass = (mime/'sub-class-of').map{|x| x['type']}
exts = (mime/'glob').map{|x| x['pattern'] =~ /^\*\.([^\[\]]+)$/ ? $1.downcase : nil }.compact
(mime/'magic').each do |magic|
Expand Down Expand Up @@ -222,6 +224,11 @@ def get_matches(mime, parent)
puts " '#{key}' => %w(#{exts}),#{comment}"
end
puts " }"
puts " TYPE_ALIASES = {"
aliases.each do |aliased, type|
puts " '#{aliased}' => '#{type}',"
end
puts " }"
puts " TYPE_PARENTS = {"
types.keys.sort.each do |key|
parents = types[key][1].sort.join(' ')
Expand Down
5 changes: 5 additions & 0 deletions test/declared_type_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,9 @@ class Marcel::MimeType::DeclaredTypeTest < Marcel::TestCase
test "ignores charset declarations" do
assert_equal "text/html", Marcel::MimeType.for(declared_type: "text/html; charset=utf-8")
end

test "resolves declared type to a canonical MIME type" do
aliased, canonical = Marcel::TYPE_ALIASES.first
assert_equal canonical, Marcel::MimeType.for(declared_type: aliased)
end
end
Loading
Loading