-
-
Notifications
You must be signed in to change notification settings - Fork 37
/
rdf.py
356 lines (287 loc) · 11.8 KB
/
rdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
import os
import re
import pathlib
from multiprocessing.dummy import Pool
import peewee
from path import Path as path
from bs4 import BeautifulSoup
from gutenbergtozim import logger
from gutenbergtozim.utils import exec_cmd, download_file
from gutenbergtozim.database import Author, Format, BookFormat, License, Book
from gutenbergtozim.utils import BAD_BOOKS_FORMATS, FORMAT_MATRIX, normalize
def setup_rdf_folder(rdf_url, rdf_path, force=False):
    """Fetch the RDF tarball, then unpack it.

    `rdf_url` is handed to `download_rdf_file`; the tarball name it returns
    is passed on to `extract_rdf_files` together with `rdf_path` and `force`.
    """
    tarball_name = download_rdf_file(rdf_url)
    extract_rdf_files(tarball_name, rdf_path, force=force)
def download_rdf_file(rdf_url):
    """Download the RDF tarball unless a local copy is already present.

    The tarball is always stored as ``rdf-files.tar.bz2``; the name is
    relative, so both the existence check and the download target depend on
    the current working directory.

    Args:
        rdf_url: location handed to `download_file`.

    Returns:
        The (relative) tarball file name, whether it was just downloaded or
        was already there.
    """
    fname = "rdf-files.tar.bz2"

    if path(fname).exists():
        # The message used to read "df-files.tar.bz2": the leading "r" of the
        # file name was missing right after the tab escape.
        logger.info("\trdf-files.tar.bz2 already exists in {}".format(fname))
        return fname

    logger.info("\tDownloading {} into {}".format(rdf_url, fname))
    # download_file receives an absolute pathlib.Path as its destination.
    download_file(rdf_url, pathlib.Path(fname).resolve())
    return fname
def extract_rdf_files(rdf_tarball, rdf_path, force=False):
    """Unpack `rdf_tarball` into the `rdf_path` folder using ``tar``.

    Nothing is extracted when `rdf_path` already exists, unless `force` is
    set. The folder is created first if needed, and the two leading path
    components of every archive member are stripped on extraction.
    """
    destination = path(rdf_path)
    already_extracted = destination.exists()
    if already_extracted and not force:
        logger.info("\tRDF-files folder already exists in {}".format(rdf_path))
        return

    logger.info("\tExtracting {} into {}".format(rdf_tarball, rdf_path))

    # make sure the target folder is there before tar changes into it
    destination.mkdir_p()

    tar_command = ["tar", "-C", rdf_path]
    tar_command += ["--strip-components", "2"]
    tar_command += ["--extract", "--no-same-owner", "--no-same-permissions"]
    tar_command += ["-f", rdf_tarball]
    exec_cmd(tar_command)
def parse_and_fill(rdf_path, concurrency, only_books=None, force=False):
    """Collect the per-book RDF files under `rdf_path` and parse them in a pool.

    Args:
        rdf_path: root folder walked with `os.walk`.
        concurrency: number of workers given to the thread pool.
        only_books: optional iterable of book IDs. When non-empty, only
            folders whose base name equals one of the IDs (compared as
            strings) are scanned.
        force: forwarded unchanged to `parse_and_process_file`.

    Files are processed in ascending order of the number in ``pg<number>.rdf``.
    Folders ending in ``999999``, the file ``pg0.rdf`` and any file not named
    ``pg<digits>.rdf`` are ignored.
    """
    logger.info("\tLooping throught RDF files in {}".format(rdf_path))

    # Stringify the requested IDs once, not once per visited folder.
    wanted_ids = {str(bid) for bid in only_books or ()}
    rdf_name = re.compile(r"pg([0-9]+)\.rdf")

    numbered_paths = []
    for root, _dirs, files in os.walk(rdf_path):
        if root.endswith("999999"):
            continue

        # skip books outside of requested list
        if wanted_ids and os.path.basename(root) not in wanted_ids:
            continue

        for fname in files:
            if fname == "pg0.rdf":
                continue
            # Only pg<digits>.rdf files carry a book number to sort on; any
            # other name used to crash the sort key with an AttributeError.
            matched = rdf_name.fullmatch(fname)
            if matched is None:
                continue
            numbered_paths.append((int(matched.group(1)), os.path.join(root, fname)))

    # Sort on the book number only, so equal numbers keep their walk order.
    numbered_paths.sort(key=lambda item: item[0])
    fpaths = [fpath for _number, fpath in numbered_paths]

    def ppf(x):
        return parse_and_process_file(x, force)

    # The context manager releases the worker threads once map() has returned.
    with Pool(concurrency) as pool:
        pool.map(ppf, fpaths)
def parse_and_process_file(rdf_file, force=False):
    """Parse a single RDF file and store the book it describes.

    Args:
        rdf_file: path to a ``.../pg<number>.rdf`` file.
        force: when true, parse the file again even if a Book with that
            number is already in the database (the flag used to be accepted
            but ignored, so existing books were always skipped).

    Raises:
        ValueError: if `rdf_file` does not exist, or if no book number can be
            extracted from its path.
    """
    if not path(rdf_file).exists():
        raise ValueError(rdf_file)

    matched = re.match(r".*/pg([0-9]+).rdf", rdf_file)
    if matched is None:
        # Previously surfaced as an AttributeError on None.
        raise ValueError(rdf_file)
    gid = matched.groups()[0]

    if not force and Book.get_or_none(id=int(gid)):
        logger.info("\tSkipping already parsed file {}".format(rdf_file))
        return

    logger.info("\tParsing file {}".format(rdf_file))
    with open(rdf_file, "r", encoding="UTF-8") as f:
        rdf_data = f.read()
    parser = RdfParser(rdf_data, gid).parse()

    # The license is compared against the literal text "None", not the
    # None object.
    if parser.license == "None":
        logger.info("\tWARN: Unusable book without any information {}".format(gid))
    elif parser.title == "":
        logger.info("\tWARN: Unusable book without title {}".format(gid))
    else:
        save_rdf_in_database(parser)
class RdfParser:
    """Extract book and author metadata from the text of one RDF file.

    Build it with the RDF document and the book ID, then call `parse()`,
    which fills in the attributes and returns the parser itself.
    """

    def __init__(self, rdf_data, gid):
        # Raw RDF document and the ID of the book it belongs to.
        self.rdf_data = rdf_data
        self.gid = gid

        self.author_id = None
        self.author_name = None
        self.first_name = None
        self.last_name = None

        self.bookshelf = None
        # 0 until parse() finds a cover image reference, then 1.
        self.cover_image = 0

    def parse(self):
        """Parse `self.rdf_data` and populate the metadata attributes.

        Sets `title`, `subtitle`, `bookshelf`, `cover_image`, `author`,
        `author_id`, `author_name`, `first_name`, `last_name`, `birth_year`,
        `death_year`, `language`, `downloads`, `license` and `file_types`
        (a dict mapping the last path component of each file URL to its
        rdf:value text, zip archives excluded).

        The language, downloads and rights elements are dereferenced without
        a None check, so a document lacking one of them fails here.

        Returns:
            self, so the call can be chained after the constructor.
        """
        soup = BeautifulSoup(self.rdf_data, "lxml")

        # The title of the book: this may or may not be divided
        # into a new-line-separated title and subtitle.
        # Split once and take both parts from the same list; deriving the
        # subtitle from the already truncated title always gave "".
        title_node = soup.find("dcterms:title")
        full_title = title_node.text if title_node else "- No Title -"
        title_lines = full_title.split("\n")
        self.title = title_lines[0]
        self.subtitle = " ".join(title_lines[1:])
        self.author_id = None

        # Parsing for the bookshelf name
        shelf_node = soup.find("pgterms:bookshelf")
        self.bookshelf = shelf_node.find("rdf:value").text if shelf_node else None

        # Search rdf to see if the image exists at the hard link
        # https://www.gutenberg.ord/cache/epub/id/pg{id}.cover.medium.jpg
        # The lookup goes through the rdf:about attribute: a plain
        # soup.find("cover.medium.jpg") looks for a *tag* with that name and
        # therefore never reported a cover.
        if soup.find(attrs={"rdf:about": re.compile(r"cover\.medium\.jpg")}):
            self.cover_image = 1

        # Parsing the name of the Author. Sometimes it's the name of
        # an organization or the name is not known and therefore
        # the <dcterms:creator> or <marcrel:com> node only return
        # "anonymous" or "unknown". When the name holds no comma, only
        # `self.last_name` is set and `self.first_name` stays None.
        self.author = soup.find("dcterms:creator") or soup.find("marcrel:com")
        if self.author:
            agent_node = self.author.find("pgterms:agent")
            agent_attrs = getattr(agent_node, "attrs", {})
            # The author ID is the last path component of the agent URL.
            self.author_id = (
                agent_attrs["rdf:about"].split("/")[-1]
                if "rdf:about" in agent_attrs
                else None
            )

            name_node = self.author.find("pgterms:name")
            if name_node:
                self.author_name = name_node.text.split(",")

                if len(self.author_name) > 1:
                    # NOTE(review): [::-2] takes every second piece counting
                    # back from the last one, so a name with an odd number of
                    # comma-separated pieces (3, 5, ...) also pulls in piece 0,
                    # i.e. the last name. Confirm this is intended.
                    self.first_name = " ".join(self.author_name[::-2]).strip()
                self.last_name = self.author_name[0]

        # Parsing the birth and (death, if the case) year of the author.
        # These values are likely to be null.
        birth_node = soup.find("pgterms:birthdate")
        self.birth_year = get_formatted_number(
            birth_node.text if birth_node else None
        )

        death_node = soup.find("pgterms:deathdate")
        self.death_year = get_formatted_number(
            death_node.text if death_node else None
        )

        # ISO 639-3 language codes that consist of 2 or 3 letters
        self.language = soup.find("dcterms:language").find("rdf:value").text

        # The download count of the books on www.gutenberg.org.
        # This will be used to determine the popularity of the book.
        self.downloads = soup.find("pgterms:downloads").text

        # The book might be licensed under GPL, public domain
        # or might be copyrighted
        self.license = soup.find("dcterms:rights").text

        # Finding out all the file types this book is available in
        self.file_types = {}
        for file_node in soup.find_all("pgterms:file"):
            mime = file_node.find("rdf:value").text
            if mime.endswith("application/zip"):
                continue
            file_name = file_node.attrs["rdf:about"].split("/")[-1]
            self.file_types[file_name] = mime

        return self
def save_rdf_in_database(parser):
    """Store the metadata held by a parsed `RdfParser` in the database.

    Steps, in order:
      1. fetch, create or refresh the Author row for `parser.author_id`
         (the author with gut_id "216" is used when the book has none);
      2. look up the License row by name (None when there is no such row);
      3. create the Book row, or update it when it already exists;
      4. make sure a PDF entry is present in `parser.file_types`
         (this mutates the parser's dict);
      5. register a Format and a BookFormat row per file type, except the
         formats listed for this book in BAD_BOOKS_FORMATS.
    """
    # Insert author, if it not exists
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
        except Author.DoesNotExist:
            try:
                author_record = Author.create(
                    gut_id=parser.author_id,
                    last_name=normalize(parser.last_name),
                    first_names=normalize(parser.first_name),
                    birth_year=parser.birth_year,
                    death_year=parser.death_year,
                )
            # concurrent workers might collide here so we retry once on
            # IntegrityError
            except peewee.IntegrityError:
                author_record = Author.get(gut_id=parser.author_id)
        else:
            # Known author: only overwrite fields this RDF has a value for.
            if parser.last_name:
                author_record.last_name = normalize(parser.last_name)
            if parser.first_name:
                author_record.first_names = normalize(parser.first_name)
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
    else:
        # No author, set Anonymous
        author_record = Author.get(gut_id="216")

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except License.DoesNotExist:
        license_record = None

    # Insert book. The same field set feeds both the create and the update
    # path, so an update refreshes bookshelf and cover_page as well.
    book_fields = {
        "title": normalize(parser.title.strip()),
        "subtitle": normalize(parser.subtitle.strip()),
        "author": author_record,  # foreign key
        "license": license_record,  # foreign key
        "language": parser.language.strip(),
        "downloads": parser.downloads,
        "bookshelf": parser.bookshelf,
        "cover_page": parser.cover_image,
    }
    try:
        book_record = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        book_record = Book.create(id=parser.gid, **book_fields)
    else:
        for field_name, value in book_fields.items():
            setattr(book_record, field_name, value)
        book_record.save()

    # insert pdf if not exists in parser.file_types
    # this is done as presence of PDF on server and RDF is inconsistent
    if not any(
        mime.startswith("application/pdf") for mime in parser.file_types.values()
    ):
        parser.file_types["{id}-pdf.pdf"] = "application/pdf"

    # Both values are the same for every file type of this book.
    bid = int(book_record.id)
    excluded_mimes = [FORMAT_MATRIX.get(f) for f in BAD_BOOKS_FORMATS.get(bid, [])]

    # Insert formats
    for file_type, raw_mime in parser.file_types.items():
        # Sanitize MIME: the charset suffix is kept for text/plain only
        mime = raw_mime
        if not mime.startswith("text/plain"):
            mime = re.sub(r"; charset=[a-z0-9-]+", "", mime)

        # Insert format type
        # NOTE(review): every occurrence of the book ID inside the file name
        # is replaced, not just the leading one (e.g. ID "8" in "utf-8").
        # Confirm the users of `pattern` expect that.
        pattern = re.sub(re.escape(parser.gid), "{id}", file_type)
        pattern = pattern.split("/")[-1]

        if mime in excluded_mimes:
            logger.error(
                "\t**** EXCLUDING **** {} for book #{} from list.".format(mime, bid)
            )
            continue

        # `images` is decided on the unsanitized MIME string.
        format_record, _ = Format.get_or_create(
            mime=mime,
            images=file_type.endswith(".images") or raw_mime == "application/pdf",
            pattern=pattern,
        )

        # Insert book format (both arguments are foreign keys)
        BookFormat.get_or_create(book=book_record, format=format_record)
def get_formatted_number(num):
    """
    Turn a string that may or may not hold a number into a year label.

    A value that contains a "-" and, once every "-" is removed and the
    surrounding whitespace stripped, consists of digits only gets " BC"
    appended to it (the original text, minus sign included, is kept).

    returns: None for an empty or None value, the value followed by
    " BC" in the case described above, the value unchanged otherwise.
    """
    if not num:
        return None

    has_dash = "-" in num
    without_dashes = num.replace("-", "").strip()
    if has_dash and without_dashes.isdigit():
        return "{} BC".format(num)

    return num
if __name__ == "__main__":
    # Basic test: for each book number from 21000 to 39999, parse the RDF file
    # found under ../rdf-files/<number>/ (relative to this file), when it
    # exists, and print the author's first and last name.
    curd = os.path.dirname(os.path.realpath(__file__))
    for i in range(21000, 40000):
        num = "{0:0=5d}".format(i)
        print(num)
        rdf = os.path.join(curd, "..", "rdf-files", num, "pg" + num + ".rdf")
        if not os.path.isfile(rdf):
            continue

        # Read as UTF-8, like parse_and_process_file does, rather than
        # relying on the platform's default encoding.
        with open(rdf, "r", encoding="UTF-8") as f:
            data = f.read()

        parser = RdfParser(data, num).parse()
        print(parser.first_name, parser.last_name)