-
Notifications
You must be signed in to change notification settings - Fork 6
/
doi2pdfname.py
executable file
·53 lines (45 loc) · 1.34 KB
/
doi2pdfname.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
from lxml import etree
import sys
import re
# Tag-name patterns. Namespaced tags are reported as "{namespace}local", so
# the patterns are only anchored at the end of the tag.
_BOOK_TAG = re.compile('book$')
_CONTAINER_TAG = re.compile('(content_item|journal_article|book)$')
_SURNAME_TAG = re.compile('surname$')
_TITLE_TAG = re.compile('title$')
# The leading "}" restricts this to a namespaced tag whose local name is
# exactly "series_metadata" (so "book_series_metadata" does not match).
_SERIES_TAG = re.compile('}series_metadata$')
_MULTI_DASH = re.compile('-{2,}')
# Characters that must not end up in the generated file name.
_FILENAME_REPLACEMENTS = str.maketrans({'/': '_', ' ': '_'})


def _walk(element, parent=None, grandparent=None, inside=False):
    """Yield ``(element, grandparent, inside)`` triples in document order.

    ``inside`` is true when the element itself or one of its ancestors is a
    content_item / journal_article / book element.  Nodes whose tag is not a
    string (e.g. processing instructions in lxml) are skipped.
    """
    tag = element.tag
    if not isinstance(tag, str):
        return
    inside = inside or _CONTAINER_TAG.search(tag) is not None
    yield element, grandparent, inside
    for sub in element:
        yield from _walk(sub, element, parent, inside)


def _text_of(element):
    """Return the stripped text of *element*, including text nested in
    inline markup such as ``<i>`` (``element.text`` alone would miss it and
    is ``None`` when the element starts with a child element)."""
    return ''.join(element.itertext()).strip()


def build_filename(root):
    """Derive a PDF file name from a parsed DOI metadata XML tree.

    The result has the form ``Surname1_Surname2__Title[_BOOK].pdf``:

    * every surname found inside a content_item, journal_article or book
      element is title-cased, stripped of spaces and used exactly once,
      in document order;
    * the title is the first non-empty ``*title`` element inside such an
      element whose grandparent is not a ``series_metadata`` element;
    * ``_BOOK`` is appended when the tree contains a ``*book`` element;
    * ``/`` and spaces are replaced by ``_``.

    Args:
        root: root element of the tree (lxml or ElementTree compatible).

    Returns:
        The generated file name as a string.
    """
    names = []
    title = ""
    suffix = ""

    # A single pass over the tree: nested containers (a content_item inside
    # a book) must not contribute their surnames twice.
    for element, grandparent, inside in _walk(root):
        tag = element.tag
        if _BOOK_TAG.search(tag):
            suffix = "_BOOK"
        if not inside:
            continue

        if _SURNAME_TAG.search(tag):
            # make each first letter of a name uppercase, then remove spaces
            name = _text_of(element).title().replace(' ', '')
            if name:
                names.append(name)

        if not title and _TITLE_TAG.search(tag):
            # Springer books sometimes include a title of the book series,
            # e.g., 10.1007/978-3-642-04490-8. We try to filter that out.
            if grandparent is not None and _SERIES_TAG.search(grandparent.tag):
                continue
            # collapse runs of whitespace, then runs of dashes
            title = ' '.join(_text_of(element).split())
            title = _MULTI_DASH.sub('-', title)

    filename = '_'.join(names) + "__" + title + suffix + ".pdf"
    # replace some special characters in filename
    return filename.translate(_FILENAME_REPLACEMENTS)


def main(argv=None):
    """Parse the XML file named by the first argument and print its file name.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.  Arguments after the first are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: doi2pdfname.py METADATA_XML_FILE")

    # Parse the given file
    parser = etree.XMLParser(remove_comments=True)
    tree = etree.parse(args[0], parser=parser)
    print(build_filename(tree.getroot()))


if __name__ == "__main__":
    main()