-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_reader.py
92 lines (73 loc) · 2.6 KB
/
data_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import logging
import pandas as pd
import platform
import subprocess
from xlrd import XLRDError
from zipfile import ZipFile
logger = logging.getLogger(__name__)
def _get_zip_file_sizes(zip_file):
'''list all the files in the zip archive and their sizes'''
with ZipFile(zip_file) as f:
file_names = f.namelist()
sizes = {}
for file_name in file_names:
file_info = f.getinfo(file_name)
sizes[file_name] = file_info.file_size
return sizes
def _read_zip(zip_file, reader):
'''Read zip file into pandas (since sometimes pd.read_csv
can't handle multiple files in the archive)
Args:
zip_file: path to zip file
reader: pandas reader function
pd.read_xlsx | pd.read_csv
'''
sizes = _get_zip_file_sizes(zip_file)
# assumption is that the dataset is the biggest file
# gets the filename of the biggest file (key)
biggest_file_name = max(sizes)
with ZipFile(zip_file) as f:
_f = f.open(biggest_file_name)
df = reader(_f)
return df
def _infer_reader(file_name):
'''infer pandas reader from the file name'''
# TODO: make it more robust
logger.info('Inferring the reader from the file name')
if 'xlsx' in file_name:
reader = pd.read_excel
elif 'csv' in file_name:
reader = pd.read_csv
else:
raise AssertionError('Cannot infer the the reader from file name')
return reader
# PUBLIC
def read_zip(zip_file):
    '''Load a zipped dataset, preferring the plain pandas reader.

    The reader is inferred from the file name and pointed at the archive
    directly; if it raises ValueError or XLRDError, the custom zip reader
    is used with the same reader instead.

    Args:
        zip_file: path to the zip file

    Returns:
        the result of the inferred pandas reader
    '''
    pandas_reader = _infer_reader(zip_file)
    try:
        logger.info('Trying to read using pandas')
        return pandas_reader(zip_file)
    except (ValueError, XLRDError):
        logger.info('Pandas reader failed, falling back on the custom zip reader')
        return _read_zip(zip_file, pandas_reader)
def get_n_rows(file_name):
    '''Get row count of a file using the platform's line-counting tool.

    Uses `wc -l` on macOS/Linux and `find /c /v ""` on Windows. The
    command is passed as an argument list (no shell), so file names
    containing spaces or shell metacharacters are handled safely.

    Args:
        file_name: path to the file to count

    Returns:
        int, the line count reported by the system tool

    Raises:
        ValueError: if the platform is not supported, or the counting
            command cannot be run or fails (e.g. the file is missing)
    '''
    _system = platform.system()
    if _system in ('Darwin', 'Linux'):
        # "--" keeps a file name starting with "-" from being read as an option
        command = ['wc', '-l', '--', file_name]
    elif _system == 'Windows':
        # the empty argument is rendered as "" on the command line
        command = ['find', '/c', '/v', '', file_name]
    else:
        raise ValueError('System "{}" is not supported'.format(_system))
    try:
        # check_output waits for the process and closes its pipe
        command_output = subprocess.check_output(command)
    except (OSError, subprocess.CalledProcessError) as error:
        raise ValueError(
            'Could not count rows in "{}": {}'.format(file_name, error))
    if _system == 'Windows':
        # output looks like "---------- FILE.CSV: 123"
        count = command_output.rsplit(b':', 1)[-1]
    else:
        # output looks like "     123 file.csv"
        count = command_output.split()[0]
    # int() accepts bytes and ignores surrounding whitespace
    return int(count)