This repository has been archived by the owner on Aug 2, 2024. It is now read-only.
forked from votinginfoproject/generate-feed-flat-files
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfiletype.py
68 lines (52 loc) · 1.74 KB
/
filetype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import magic
import csv
import xml.sax
# Maps a substring of the description returned by magic to the short type
# name this module reports. get_type() returns the value of the first key
# found in the description, so the order of the entries matters.
TYPE_MAPPING = {
    "gzip": "gz",
    "bzip2": "bz2",
    "Zip": "zip",
    "RAR": "rar",
    "POSIX tar": "tar",
}

# Short type names that is_compression_by_type() treats as compression formats.
COMPRESSION = [
    "gz",
    "bz2",
]

# Short type names that is_archived_by_type() treats as archive formats.
ARCHIVED = [
    "zip",
    "rar",
    "tar",
]
# Shared magic.Magic instance, created once at import time; get_type() calls
# its from_file() method to obtain a description of a file.
m = magic.Magic()
def get_type(fname):
    """Return a short type name for the file at *fname*.

    The description produced by ``m.from_file`` is first matched against the
    keys of ``TYPE_MAPPING``; the value of the first key contained in the
    description is returned (``'gz'``, ``'bz2'``, ``'zip'``, ``'rar'`` or
    ``'tar'``).

    If the description contains ``'text'`` the content is inspected further:

    * ``'xml'`` if the file parses as XML,
    * ``'txt'`` if it has at most two lines (too little for CSV sniffing),
    * ``'csv'`` if ``csv.Sniffer`` recognises a dialect in the first 1024
      bytes,
    * ``'txt'`` otherwise.

    Any other file yields ``None``.
    """
    ftype = m.from_file(fname)

    for marker, short_name in TYPE_MAPPING.items():
        if marker in ftype:
            return short_name

    # NOTE(review): files that are neither a known archive nor text are
    # assumed to fall through to None -- confirm against callers.
    if 'text' not in ftype:
        return None

    # solutions here from http://stackoverflow.com/questions/9084228/python-to-check-if-a-gzipped-file-is-xml-or-csv
    # and http://stackoverflow.com/questions/2984888/check-if-file-has-a-csv-format-with-python

    # The XML probe gets its own file handle: on parse errors xml.sax closes
    # the stream it was given, so the handle cannot be rewound and reused.
    # NOTE: xml.sax is not hardened against maliciously constructed XML.
    with open(fname, 'rb') as fh:
        try:
            xml.sax.parse(fh, xml.sax.ContentHandler())
        except Exception:
            # Best effort: any failure simply means "not XML". Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            pass
        else:
            return 'xml'

    with open(fname, 'rb') as fh:
        # With two lines or fewer the CSV check is not reliable, so 'txt' is
        # returned as the default. Stop counting as soon as a third line is
        # seen.
        linecount = 0
        for linecount, _ in enumerate(fh, 1):
            if linecount > 2:
                break
        if linecount <= 2:
            return 'txt'

        fh.seek(0)
        sample = fh.read(1024)

    # csv.Sniffer works on text. On Python 3 a binary read yields bytes, so
    # decode them; 'replace' tolerates a multi-byte character cut off at the
    # 1024-byte boundary. On Python 2 the sample is already a str.
    if not isinstance(sample, str):
        sample = sample.decode('utf-8', 'replace')

    try:
        csv.Sniffer().sniff(sample)
    except csv.Error:
        return 'txt'
    return 'csv'
def is_compression(fname):
    """Return True if get_type() reports a compression format for *fname*."""
    return is_compression_by_type(get_type(fname))
def is_compression_by_type(ftype):
    """Return True if the short type name *ftype* is listed in COMPRESSION."""
    return ftype in COMPRESSION
def is_archived(fname):
    """Return True if get_type() reports an archive format for *fname*."""
    return is_archived_by_type(get_type(fname))
def is_archived_by_type(ftype):
    """Return True if the short type name *ftype* is listed in ARCHIVED."""
    return ftype in ARCHIVED