-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDelimitedFileUtil.py
185 lines (176 loc) · 8.93 KB
/
DelimitedFileUtil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python
# Copyright 2019 Irwin Jungreis
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,x
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
DelimitedFile.py
Utilities for working with delimited files.
"""
from __future__ import division
from __future__ import print_function
import sys, itertools
from CommonUtils import DictClass, myopen, open_with_backup, strip_nl
assert sys.version_info[:2] == (2, 7), 'This is intended to run in Python 2.7.'
class DelimitedFileReader(object) :
"""
Iterator for reading a delimited file, where fields are separated by the delimiter
(e.g. Tab) and (optionally) the first (header) line contains the field names.
Lines starting with '#' may be skipped.
After opening the file, and, if desired, specifying mapping functions for particular
fields, call next (or iterate through the class instance), to read the lines after
the header.
Tip: to get a list of all the records, call list(reader).
"""
def __init__(self, fileName, delimiter = '\t', fieldNames = None,
intFields = None, floatFields = None, skipComments = True,
allowDupFields = False, skipQuotes = False, skipEmptyLines = False) :
""" If fieldNames is None, get from the first line of the file;
o.w., assume there is no header line.
Integer and float fields may be added later using add_mapper instead of here.
If skipComments, ignore lines that start with '#'.
If allowDupFields, allow several fields to have the same name; in that case
the result of accessing that field is ambiguous.
If skipQuotes, strip any double quote characters from start or end of string.
Excel sometimes adds these when saving Tab delimited files.
If skipEmptyLines, allow blank lines, and ignore them.
"""
self.inFile = myopen(fileName)
self.fileIter = (_skip_pounds(self.inFile, skipEmptyLines) if skipComments else
self.inFile)
self.delimiter = delimiter
if fieldNames == None :
self.prevLine = self.fileIter.next()
self.fieldNames = strip_nl(self.prevLine).split(self.delimiter)
else :
self.prevLine = None
self.fieldNames = list(fieldNames)
if not allowDupFields :
for name in self.fieldNames :
assert sum(n == name for n in self.fieldNames) == 1, 'Dup field %s' % name
self.fieldMappers = {}
if intFields != None :
self.add_mapper(intFields, int)
if floatFields != None :
self.add_mapper(floatFields, float)
self.skipQuotes = skipQuotes
self.skipEmptyLines = skipEmptyLines
def get_fieldNames(self) :
return self.fieldNames[:] # Return a copy so changing it won't affect this DFR
def add_mapper(self, fieldNameOrNames, mapper) :
# fieldNameOrNames can be a string or a collection of strings
# mapper = lambda valueString : value, e.g., int
# If mapper can't map it should raise ValueError so field will stay str.
if isinstance(fieldNameOrNames, str) :
fieldNameOrNames = [fieldNameOrNames]
for fieldName in fieldNameOrNames :
self.fieldMappers[fieldName] = mapper
class DelimitedFileRecord(DictClass) :
"""
This class holds the information from one line of a delimited file. The fields can
be accessed using [fieldName] as if it were a dictionary,
or using the .fieldName syntax, as if it were a class.
The fields are mapped at access time, not when the record is created (which is why
we don't just use a DictClass). Reasons for this delayed mapping:
- So when we write, we can write the unmapped fields. That way 1.10 doesn't
become 1.1.
- If a value is changed, the mapping will be applied to the new value.
"""
def __init__(self, itemsBeforeMap, fieldMappers) :
DictClass.__init__(self, itemsBeforeMap)
# self.fieldMappers = fieldMappers would be wrong because DictClass
# overrided __setattr__.
object.__setattr__(self, 'fieldMappers', fieldMappers)
def __getitem__(self, key) :
return self._map_value(key, DictClass.__getitem__(self, key))
def items(self) :
"Warning: won't work in 3.0 because it returns a list intead of an iteration."
return list(self._yielditems())
def values(self) :
"Warning: won't work in 3.0 because it returns a list intead of an iteration."
return [value for key, value in self._yielditems()]
def _yielditems(self) :
for key, value in DictClass.items(self) :
yield key, self._map_value(key, value)
def _map_value(self, key, value) :
if key in self.fieldMappers :
try :
return self.fieldMappers[key](value) # OK; . only calls __getattr__
# for nonmembers.
except ValueError : # int('') -> ValueError
pass
except TypeError : # int(None) -> TypeError. None can't be read but could
# be added later
pass
return value # Keep as str things that can't be mapped.
def get(self, key, default = None) :
# Override "get" so that it does mapping
return self[key] if self.has_key(key) else default
def next(self) :
"Read a line and return a DictClass like {fieldName : mappedFieldValue, ...}"
self.prevLine = self.fileIter.next()
fieldValues = strip_nl(self.prevLine).split(self.delimiter)
if self.skipQuotes :
fieldValues = [value.strip('"') for value in fieldValues]
items = itertools.izip_longest(# Handle missing fields at the end
self.fieldNames, fieldValues, fillvalue = '')
return self.DelimitedFileRecord(items, self.fieldMappers)
def get_prev_line(self) : # Return the most recently read line
return self.prevLine
def __iter__(self) :
return self
def close(self) :
self.inFile.close()
DFR = DelimitedFileReader
class DelimitedFileWriter(object) :
"""
Class for writing a delimited file.
"""
def __init__(self, fileName, fieldNames, delimiter = '\t', writeHeader = True,
backup = False) :
self.outFile = open_with_backup(fileName) if backup else myopen(fileName, 'w')
self.delimiter = delimiter
self.fieldNames = fieldNames[:] # Make a copy
for name in self.fieldNames :
assert sum(n == name for n in self.fieldNames) == 1, 'Duplicate field %s.' % name
if writeHeader :
print(delimiter.join(fieldNames), file = self.outFile)
def write_line(self, rec) :
"rec is a DelimitedFileRecord, DictClass, or dict with the values for each field."
"We use dict.__getitem__ to bypass mapping, so, e.g., 1.10 stays 1.10."
def get_it(fieldName) :
try :
value = dict.__getitem__(rec, fieldName)
if type(value) is unicode :
# Convert to str, since no encoding was specified when opening outFile
value = value.encode('utf8')
# Maybe we should instead convert unicode to str with ? for non-ascii
# chars via value = value.encode('ascii', 'replace') ...
else :
value = str(value)
return value
except KeyError :
return ''
print(self.delimiter.join(map(get_it, self.fieldNames)), file = self.outFile)
def write_comment(self, comment) :
"Write a line containing comment preceded by '#'"
print('#' + comment, file = self.outFile)
def close(self) :
self.outFile.close()
def flush(self) :
self.outFile.flush()
DFW = DelimitedFileWriter
def _skip_pounds(strIter, skipEmptyLines = False) :
"""
Return an iterator like strIter except it skips strings that start with "#".
If skipEmptyLines, also skip lines consisting of nothing but a newline character.
"""
return itertools.ifilterfalse(lambda s : s.startswith('#') or
(len(strip_nl(s)) == 0 if skipEmptyLines else
False),
strIter)