-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_sqlite
executable file
·271 lines (213 loc) · 7.15 KB
/
build_sqlite
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!/usr/bin/python
import re
import zipfile
import os, os.path, sqlite3
from subprocess import Popen,PIPE
rTABLEROW = r' *([A-Za-z_]+) +([AN]) *([0-9.]+)([*]?) +([YN]) +(.*)'
# newer revisions have this format for a few of the table rows...
rTABLEROW2 = r' *([A-Z][A-Za-z_.+0-9]+) +([AN]) *([0-9.]+)([*]?) +()(.*)'
class Field(object):
"""Describes a single column in a table"""
def __init__(self, name, typ, size, key, blank, desc):
self.name, self.typ, self.size, self.key, self.desc = name, typ, size, key, desc
def __str__(self):
if self.typ == 'N':
parts = self.size.split('.')
if len(parts) == 2:
# For documentation, sqlite3 doesn't handle these.
sql_type = 'decimal(%s)' % (','.join(parts))
else:
sql_type = 'number'
elif self.typ == 'A':
sql_type = 'text'
else:
sql_type = ''
desc = self.desc.replace('\n',' ')
return '%s %s -- %s' % (self.name, sql_type, desc)
class Table(object):
"""Describes a table"""
def __init__(self, name, file):
self.name = name
self.file = file
self.fields = []
self.links = []
self.keys = []
def __str__(self):
ff = "\n, ".join(str(r) for r in self.fields)
ddl = "create table %s (\n %s\n);" % (self.file, ff)
return ddl
class Schema(object):
"""Parses schema from PDF description and reads table data from zip file."""
def __init__(self, zipname):
self.tables = []
self.table = None
self.peeked = None
self.zf = zipfile.ZipFile(zipname)
pdfname ,= [ info.filename for info in self.zf.filelist if info.filename.endswith('_doc.pdf')]
proc = Popen(['/usr/bin/env','pdftotext','-layout','-','-'],stdin=PIPE,stdout=PIPE,stderr=PIPE)
pdftext, _ = proc.communicate(input=self.zf.read(pdfname))
self.data = iter(pdftext.splitlines())
# Rob Pike style parser
state = self.start
try:
while state:
state = state()
except StopIteration:
pass
def peek(self):
if not self.peeked:
self.peeked = self.next()
return self.peeked
def next(self):
if self.peeked:
page,line = self.peeked
self.peeked = None
return page,line
line = self.data.next()
if line and line[0] == '\x0c':
return True, line[1:]
return False, line
def start(self):
page, line = self.next()
# continued table
if page and 'field name' in line.lower():
return self.tablebody
m = re.match(r'(.* file) \(file name = ([^ ]*)\).*',line, re.I)
if m:
self.table = Table(m.group(1),m.group(2))
self.tables.append(self.table)
return self.start
m = re.match(r'.*Links to the (.* file) by (the )?([A-Za-z_]+).*',line, re.I)
if m:
self.table.links.append((m.group(1), m.group(3)))
return self.start
m = re.match(r'Table [0-9]+[.].*File Format', line, re.I)
if m:
page, line = self.next()
if 'field name' in line.lower():
return self.tablebody
# label split across two lines... (p.32)
if 'name' in self.next()[1].lower():
return self.tablebody
print page,line
raise Exception("Expected 'field name'")
return self.start
def tablebody(self):
"""Want to make sure we get everything"""
page, line = self.next()
if page:
print "PAGE",line
if 'field name' not in line.lower():
if 'name' not in self.next()[1].lower():
raise Exception("unexpected weirdness in table header continuation on page break")
return self.tablebody
if not line:
if re.match(rTABLEROW, self.peek()[1]):
return self.tablebody
return self.start
# most tables end with a footnote
if line[0] == '*':
return self.start
m = re.match(rTABLEROW, line)
if not m:
m = re.match(rTABLEROW2, line)
if m:
row = list(m.groups())
row[0] = row[0].replace('.','').replace('+','_')
self.table.fields.append(Field(*row))
return self.tablebody
if re.match(r' {5,}.*', line):
self.table.fields[-1].desc += ' '+line.strip()
return self.tablebody
print "FAIL", line
return self.start
def get_rows(self, table):
try:
f = t.file+".txt"
data = schema.zf.read(f)
except KeyError:
return None
# splits lines, then re-merges continuation lines
lines = []
prev = None
for line in data.splitlines():
# line = line.strip("\r\n")
if line and line[0] == '~':
if prev:
lines.append(prev)
prev = line
else:
prev += '\n'+line
lines.append(prev)
# load data
values = []
for line in lines:
def clean(x):
if x == "":
return None
if x[0] == "~":
return unicode(x[1:-1],'cp1251')
if "." in x:
return float(x)
if '/' in x:
return unicode(x)
try:
return int(x)
except ValueError:
return unicode(x)
row = map(clean, line.split('^'))
if len(row) != len(t.fields):
print fields
print row
raise Exception("length mismatch for %s -- expected %d columns got %d" % (t.name, len(row), len(t.fields)))
values.append(row)
return values
# These three functions generate DDL for a table
version = 27
dbfile = 'sr%d.sqlite' % version
if os.path.exists(dbfile):
os.unlink(dbfile)
conn = sqlite3.connect(dbfile)
c = conn.cursor()
src_file = 'sr%d.zip' % version
url = 'http://www.ars.usda.gov/SP2UserFiles/Place/12354500/Data/SR%d/dnload/sr%dasc.zip' %(version,version)
if not os.path.exists(src_file):
print src_file, 'is missing'
print "trying to fetch", url
import requests
r = requests.get(url)
if r.status_code == 200:
open(src_file,'w').write(r.content)
else:
print "Failed", r.status_code, r.content
exit(-1)
print "Reading schema... (this requires pdftotext)"
schema = Schema(src_file)
print "loading tables...\n"
for t in schema.tables:
fields = [ field.name for field in t.fields]
values = schema.get_rows(t)
if values is None:
print "%s '%s' -- Skipped, not in zip file" % (t.file, t.name)
continue
# DDL
c.execute(str(t))
# Insert
r = c.executemany("insert into %s values (%s)" % (t.file,",".join('?' for f in fields)), values)
print "%s '%s' -- %d rows" % (t.file, t.name, r.rowcount)
# add a full text index.
if False:
c.execute("create virtual table food using fts3()")
c.execute("insert into food (docid, content) select ndb_no, long_desc||' '||comname||' '||manufacname from food_des")
conn.commit()
c.close()
# This query cleans out some data I personally don't need. commercial boxed food products, etc. I just want
# basic ingredients that appear in recipes.
"""
create table todo as select ndb_no from food_des where fdgrp_Cd in ('2500','3600','2200','0800') or manufacname != '';
delete from food_des where ndb_no in (select ndb_no from todo);
delete from nut_data where ndb_no in (select ndb_no from todo);
delete from footnote where ndb_no in (select ndb_no from todo);
delete from weight where ndb_no in (select ndb_no from todo);
drop table todo
"""