forked from jpedrosa/sua
-
Notifications
You must be signed in to change notification settings - Fork 0
/
csv_table.swift
348 lines (315 loc) · 8.47 KB
/
csv_table.swift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
// The states of the CSVTable parser. CSVTable keeps the current state in
// `entryParser` and dispatches on it for every parsing step.
enum CSVTableParserEntry {
  // A header column value was collected and is about to be stored; the
  // header record has just ended.
  case Header, HeaderExit
  // A data row column value was collected and is about to be stored; the
  // data row has just ended.
  case Row, RowExit
  // The parser is at the start of a column; the parser is inside a column
  // that started with a double quote.
  case Column, ColumnQuoted
}
// CSVTable stores a table of string records in a CSV-like file that can hold
// Unicode data.
//
// The file format is like this:
//
// * The first row includes the serial number only. It is a sequential number
// starting from 0 that helps to make the rows unique for updating and deleting.
//
// * The second row is the header row that helps to give a label to each column.
// The very first column is already labeled by default with "id".
//
// * From the third row onwards we find the actual data rows. They all start
// with the id column.
//
// * All rows should end with just the newline character (\n or 10).
//
// * All rows should include the same number of columns.
//
// * Data is in general represented in string format only. It simplifies it
// a lot and may match the use-case of users when they already have strings to
// compare the data with.
//
// * The column data follows the convention specified on this Wikipedia entry:
// https://en.wikipedia.org/wiki/Comma-separated_values
// Where columns can start with a double quote which can include within it:
//
// ** A data comma. E.g. "Hey, ho"
//
// ** A data newline character. E.g. "So \n it begins."
//
// ** And even a data double quote if it is escaped with another double
// quote. E.g. "Let's "" go"
//
// A table of string records that is loaded from, and saved to, a CSV-like
// file. The first line of the file holds the serial number, the second line
// holds the header and every following line holds one data row whose first
// column is the row id.
public struct CSVTable {

  var _path: String

  // The id that the next inserted row will receive.
  public var serialId = 0

  var stream = ByteStream()

  // Current parser state. See next().
  var entryParser = CSVTableParserEntry.Header

  var _header = [String]()

  // The state that stores a collected column: .Header while the header is
  // being parsed, .Row afterwards.
  var columnGroup = CSVTableParserEntry.Header

  // The state that ends the current record: .HeaderExit or .RowExit.
  var recordExit = CSVTableParserEntry.HeaderExit

  // The most recently collected column value, waiting to be stored.
  var columnValue = ""

  var _rows = [[String]]()

  // The data row that is currently being assembled.
  var _row = [String]()

  // Accumulates the content of a quoted column across escaped quotes.
  var unescapedColumnValue = ""

  // True when the last byte consumed on the current record was a column
  // separator, which means that one more column (possibly an empty one) still
  // belongs to the record.
  var pendingColumn = false

  // Loads the table stored at `path`. Throws whatever load() throws.
  public init(path: String) throws {
    _path = path
    try load()
  }

  // Reads the file at `path` and replaces the serial id, the header and the
  // rows with its content.
  //
  // Throws CSVTableError.SerialId when the file does not start with a number
  // that fits an Int, CSVTableError.NewLine when that number is not followed
  // by a newline, and the parsing errors documented on the in*() methods.
  mutating public func load() throws {
    var f = try File(path: _path)
    defer { f.close() }
    let bytes = try f.readAllBytes()
    // Start from a fresh stream so that loading again does not depend on the
    // indexes left behind by a previous parse.
    stream = ByteStream()
    stream.bytes = bytes
    guard stream.eatWhileDigit() else {
      throw CSVTableError.SerialId
    }
    // A run of digits can still be too large for Int. Report it instead of
    // crashing on a forced unwrap.
    guard let token = stream.collectTokenString(), let sid = Int(token) else {
      throw CSVTableError.SerialId
    }
    serialId = sid
    guard stream.eatOne(c: 10) else { // Newline
      throw CSVTableError.NewLine
    }
    entryParser = .Column
    columnGroup = .Header
    recordExit = .HeaderExit
    // Drop whatever an earlier load() left behind. Otherwise reloading would
    // append the rows a second time.
    _header = []
    _rows = []
    _row = []
    columnValue = ""
    unescapedColumnValue = ""
    pendingColumn = false
    while !stream.isEol {
      try next()
    }
    // The input can end while a step is still pending. Be nice and accept a
    // last line without a trailing newline. Sometimes when manually editing a
    // file, the last line could lose its newline.
    switch entryParser {
    case .Header, .Row:
      // A column value was collected right before the end of the input but
      // it has not been stored yet.
      try next()
    case .ColumnQuoted:
      // The input ended inside a quoted column.
      throw CSVTableError.Column
    case .HeaderExit, .RowExit, .Column:
      break
    }
    appendPendingEmptyColumn()
    if !_row.isEmpty {
      try inRowExit()
    }
  }

  // Runs the parsing step that matches the current parser state.
  mutating func next() throws {
    switch entryParser {
    case .Header:
      try inHeader()
    case .HeaderExit:
      try inHeaderExit()
    case .Row:
      try inRow()
    case .RowExit:
      try inRowExit()
    case .Column:
      try inColumn()
    case .ColumnQuoted:
      try inColumnQuoted()
    }
  }

  // Stores the collected column value as the next header label.
  mutating func inHeader() throws {
    _header.append(columnValue)
    entryParser = .Column
  }

  // Ends the header record and switches the parser to data rows. Throws
  // CSVTableError.Header when the header has no columns.
  mutating func inHeaderExit() throws {
    if header.isEmpty {
      throw CSVTableError.Header
    }
    entryParser = .Column
    columnGroup = .Row
    recordExit = .RowExit
    _row = []
  }

  // Stores the collected column value as the next column of the current row.
  mutating func inRow() throws {
    _row.append(columnValue)
    entryParser = .Column
  }

  // Ends the current data row and adds it to the table. Throws
  // CSVTableError.Row when its column count differs from the header's.
  mutating func inRowExit() throws {
    if _row.count != _header.count {
      throw CSVTableError.Row
    }
    entryParser = .Column
    _rows.append(_row)
    _row = []
  }

  // Tells whether the byte is a comma (44) or a newline (10).
  func matchCommaOrNewLine(c: UInt8) -> Bool {
    return c == 44 || c == 10
  }

  // Stores the empty column that follows a separator when the record ends
  // right after that separator. E.g. the third column of the line: 1,a,
  mutating func appendPendingEmptyColumn() {
    if pendingColumn {
      pendingColumn = false
      if columnGroup == .Header {
        _header.append("")
      } else {
        _row.append("")
      }
    }
  }

  // Parses from the start of a column: the opening of a quoted column, an
  // empty column, an unquoted column value or the end of the record.
  mutating func inColumn() throws {
    stream.startIndex = stream.currentIndex
    if stream.eatDoubleQuote() {
      // The column announced by a preceding separator has started.
      pendingColumn = false
      unescapedColumnValue = ""
      entryParser = .ColumnQuoted
      stream.startIndex = stream.currentIndex
    } else if stream.eatComma() {
      // An empty column. The separator that was just consumed also announces
      // that another column follows.
      columnValue = ""
      pendingColumn = true
      entryParser = columnGroup
    } else if stream.eatUntil(fn: matchCommaOrNewLine) {
      columnValue = stream.collectTokenString() ?? ""
      pendingColumn = stream.eatComma()
      entryParser = columnGroup
    } else if stream.eatOne(c: 10) {
      // The record is over. When it ended right after a separator, the last
      // column is empty and has to be stored before leaving the record.
      appendPendingEmptyColumn()
      entryParser = recordExit
    } else {
      throw CSVTableError.Unreachable
    }
  }

  // Parses the content of a quoted column up to the next double quote. A pair
  // of double quotes is an escaped quote and keeps the parser in the quoted
  // state. Throws CSVTableError.Column when no closing quote is found.
  mutating func inColumnQuoted() throws {
    if stream.skipTo(c: 34) >= 0 { // "
      if let s = stream.collectTokenString() {
        unescapedColumnValue += s
      }
      let _ = stream.eatDoubleQuote()
      // When the next byte is a second double quote, it stays at the start of
      // the next token, which is how an escaped quote ends up in the value.
      stream.startIndex = stream.currentIndex
      if !stream.eatDoubleQuote() { // Ends if not an escaped quote sequence: ""
        if let s = stream.collectTokenString() {
          unescapedColumnValue += s
        }
        columnValue = unescapedColumnValue
        pendingColumn = stream.eatComma()
        entryParser = columnGroup
      }
    } else {
      throw CSVTableError.Column
    }
  }

  public var path: String { return _path }

  public var header: [String] { return _header }

  public var rows: [[String]] { return _rows }

  // Adds a row at the end of the table and returns the id it was given.
  // Don't include the id, since it will be automatically generated based on
  // the next number on the sequence. Throws CSVTableError.Insert when the
  // number of values does not match the header.
  mutating public func insert(row: [String]) throws -> Int {
    if row.count + 1 != header.count {
      throw CSVTableError.Insert
    }
    let sid = serialId
    _rows.append(["\(sid)"] + row)
    serialId += 1
    return sid
  }

  // Alias for insert.
  mutating public func append(row: [String]) throws -> Int {
    return try insert(row: row)
  }

  // Replaces the values of the row identified by index. This will insert it
  // if it does not exist, and it will keep whatever index id it was called
  // with. This can help with data migration. The serialId can be adjusted
  // accordingly afterwards. Throws CSVTableError.Update when the number of
  // values does not match the header.
  mutating public func update(index: String, row: [String]) throws {
    if row.count + 1 != header.count {
      throw CSVTableError.Update
    }
    let record = [index] + row
    let n = findIndex(index: index)
    if n >= 0 {
      _rows[n] = record
    } else {
      _rows.append(record)
    }
  }

  // Returns the position in `rows` of the first row whose id column equals
  // index, or -1 when there is none.
  func findIndex(index: String) -> Int {
    for i in 0..<_rows.count {
      if _rows[i].first == index {
        return i
      }
    }
    return -1
  }

  // If the record pointed at by index does not exist, simply ignore it.
  mutating public func delete(index: String) {
    let n = findIndex(index: index)
    if n >= 0 {
      _rows.remove(at: n)
    }
  }

  // Sets a single column of the row identified by index. Column 0 is the id
  // column. The call is ignored when the row does not exist or when
  // columnIndex is outside of the row.
  mutating public func updateColumn(index: String, columnIndex: Int,
      value: String) {
    let n = findIndex(index: index)
    if n >= 0 && columnIndex >= 0 && columnIndex < _rows[n].count {
      _rows[n][columnIndex] = value
    }
  }

  // Returns the row identified by index, id column included, or nil when it
  // does not exist.
  public func select(index: String) -> [String]? {
    let n = findIndex(index: index)
    if n >= 0 {
      return _rows[n]
    }
    return nil
  }

  // Joins the escaped values of a record with commas and ends it with a
  // newline.
  static func encodeRecord(record: [String]) -> String {
    var line = ""
    var comma = false
    for value in record {
      if comma { line += "," }
      line += escape(string: value)
      comma = true
    }
    line += "\n"
    return line
  }

  // The whole table in its file format: the serial id, the header and then
  // the rows.
  public var data: String {
    var s = "\(serialId)\n"
    s += CSVTable.encodeRecord(record: _header)
    for row in _rows {
      s += CSVTable.encodeRecord(record: row)
    }
    if !_rows.isEmpty {
      // The output has always ended with one blank line when there are rows.
      // It is kept so that saved files do not change. load() accepts it,
      // because a blank line at the very end of the input is never turned
      // into a row.
      s += "\n"
    }
    return s
  }

  // Writes `data` to the file at `path`.
  public func save() throws {
    let _ = try IO.write(filePath: path, string: data)
  }

  // This makes sure the data is escaped for double quote, comma and new line.
  // A string that includes any of them is wrapped in double quotes, with
  // every double quote in it doubled. Any other string is returned as is.
  public static func escape(string: String) -> String {
    var needsQuoting = false
    // Scan scalars rather than characters so that the newline of a "\r\n"
    // pair, which forms a single character, is still detected.
    for scalar in string.unicodeScalars {
      let v = scalar.value
      if v == 34 || v == 44 || v == 10 { // " , newline
        needsQuoting = true
        break
      }
    }
    if !needsQuoting {
      return string
    }
    let quote: UnicodeScalar = "\""
    var quoted = String.UnicodeScalarView()
    quoted.append(quote)
    for scalar in string.unicodeScalars {
      if scalar == quote {
        quoted.append(quote)
      }
      quoted.append(scalar)
    }
    quoted.append(quote)
    return String(quoted)
  }

  // Writes a new, empty table file at path and loads it. The "id" column is
  // added in front of the given header labels.
  public static func create(path: String, header: [String]) throws -> CSVTable {
    var s = "0\nid"
    for c in header {
      s += ","
      s += escape(string: c)
    }
    s += "\n"
    let _ = try IO.write(filePath: path, string: s)
    return try CSVTable(path: path)
  }

}
// The errors thrown by CSVTable.
enum CSVTableError: ErrorProtocol {
  // Thrown while loading: the file does not start with a serial number, or
  // the serial number is not followed by a newline.
  case SerialId, NewLine
  // Thrown while parsing: the header has no columns, a row's column count
  // differs from the header's, or a quoted column has no closing quote.
  case Header, Row, Column
  // Thrown by insert and update when the number of values does not match
  // the header.
  case Insert, Update
  // Thrown when the column parser matches none of its alternatives.
  case Unreachable
}