-
Notifications
You must be signed in to change notification settings - Fork 33
/
teitonlm.py
131 lines (100 loc) · 5.15 KB
/
teitonlm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
#@Author Dulip Withanage
import subprocess
import shutil
from lxml import etree
from nlmmanipulate import NlmManipulate
from teimanipulate import TeiManipulate
from debug import Debuggable
class TeiToNlm(Debuggable):
    """Drive the TEI -> NLM conversion step.

    The class cleans up the TEI tree, runs the Saxon XSL transform as an
    external ``java`` process, and then applies a series of post-transform
    fixes ("quirks") to the NLM output through ``NlmManipulate``.
    """

    # Namespace map shared by every XPath query run against the TEI tree.
    _TEI_NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Fully qualified tags that may legitimately enclose a div holding a head.
    _ALLOWED_HEAD_ANCESTORS = ('{http://www.tei-c.org/ns/1.0}div',
                               '{http://www.tei-c.org/ns/1.0}body')

    # Predicate selecting the hi children whose rend marks them for splitting.
    _STYLED_HI = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                  'contains(@rend, "Text Body")]')

    def __init__(self, gv):
        """Keep a reference to the shared settings object and its debugger.

        :param gv: settings object; this class reads its ``debug``,
            ``settings``, ``java_class_path``, ``runtime_catalog_path``,
            ``nlm_style_sheet_dir``, ``mk_dir`` and the TEI/NLM path attributes.
        """
        self.gv = gv
        self.module_name = "TEI to NLM"
        self.debug = gv.debug
        # NOTE(review): super(Debuggable, self) starts the lookup *after*
        # Debuggable in the MRO, so Debuggable.__init__ is not invoked here.
        # Left unchanged because Debuggable is defined outside this file;
        # confirm whether super(TeiToNlm, self) was intended.
        super(Debuggable, self).__init__()

    def _saxon_tei2nlm_args(self):
        """Return the Saxon command line as a list, one item per argument."""
        return [
            "java", "-classpath", self.gv.java_class_path,
            "-Dxml.catalog.files=" + self.gv.runtime_catalog_path,
            "net.sf.saxon.Transform",
            "-x", "org.apache.xml.resolver.tools.ResolvingXMLReader",
            "-y", "org.apache.xml.resolver.tools.ResolvingXMLReader",
            "-r", "org.apache.xml.resolver.tools.CatalogResolver",
            "-o", self.gv.nlm_temp_file_path,
            self.gv.tei_file_path,
            self.gv.nlm_style_sheet_dir,
            'autoBlockQuote=true',
        ]

    def saxon_tei2nlm(self):
        """Return the Saxon command line as one space-joined string.

        The arguments are not quoted, so the string is only safe to hand to a
        shell when none of the configured paths contain spaces or shell
        metacharacters. ``run_transform`` uses the argument list instead.

        :return: the command line as a single string.
        """
        return ' '.join(self._saxon_tei2nlm_args())

    def run_quirks(self, process_ref_lists):
        """Apply the post-transform fixes to the NLM document.

        :param process_ref_lists: when true, also look for reference lists
            and tag bibliography references.
        """
        manipulate = NlmManipulate(self.gv)
        line_break = 'comment()[. = "meTypeset:br"]'

        # The setting is compared against the literal string 'False'.
        if self.gv.settings.get_setting('linebreaks-as-comments', self) == 'False':
            # we need to convert every instance of <!--meTypeset:br--> to a new paragraph
            manipulate.close_and_open_tag(line_break, 'p')
            manipulate.close_and_open_tag_not_styled(line_break, 'title')

        # we will replace inside table cells and titles regardless because these are real JATS break tags
        manipulate.insert_break(line_break, 'td')
        manipulate.insert_break(line_break, 'title')

        manipulate.remove_empty_elements('//sec//p')

        if process_ref_lists:
            self.debug.print_debug(self, u'Finding potential reference lists')
            manipulate.find_reference_list()
            manipulate.tag_bibliography_refs()

        # NOTE(review): the source this was recovered from had lost its
        # indentation; these removals are assumed to run unconditionally.
        manipulate.remove_empty_elements('//sec/list')
        manipulate.remove_empty_elements('//sec/disp-quote')
        manipulate.remove_empty_elements('//back/ref-list/ref')

    def _extract_headings(self, manipulate, tree):
        """Strip disallowed wrapper elements from around divs that own a head."""
        # make sure that head elements are not encapsulated within any elements that will stop them from being
        # correctly transformed by the XSL
        head_elements = tree.xpath('//tei:div[tei:head]', namespaces=self._TEI_NAMESPACES)
        count = 0

        for element in head_elements:
            # Walk upwards, marking every ancestor until the first div/body.
            ancestor = element.getparent()
            while ancestor is not None:
                if not ancestor.tag or ancestor.tag in self._ALLOWED_HEAD_ANCESTORS:
                    break
                ancestor.tag = 'REMOVE'
                count += 1
                ancestor = ancestor.getparent()

        if count > 0:
            # strip_tags drops the marked elements but keeps their children and text.
            etree.strip_tags(tree, 'REMOVE')
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Extracted {0} headings from inside invalid elements'.format(count))

    def _split_styled_paragraphs(self, manipulate, tree):
        """Move each matching hi child of a p into its own following p."""
        # split any p tags with sub-tags hi rend="Indent" into new elements
        biblio_elements = tree.xpath('//tei:p[' + self._STYLED_HI + ']', namespaces=self._TEI_NAMESPACES)

        for parent in biblio_elements:
            add_position = parent

            for element in parent.xpath(self._STYLED_HI, namespaces=self._TEI_NAMESPACES):
                new_p = etree.Element('p')
                if 'rend' in parent.attrib:
                    new_p.attrib['rend'] = parent.attrib['rend']
                # Insert after the previously added p so document order is kept.
                add_position.addnext(new_p)
                new_p.append(element)
                add_position = new_p

            # NOTE(review): saving once per paragraph is assumed from the use
            # of ``parent`` in the message below; indentation was lost.
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Separated out p {0}'.format(manipulate.get_stripped_text(parent)))

    def pre_cleanup(self):
        """Tidy the TEI tree before it is handed to the XSL transform.

        Loads the tree through ``TeiManipulate``, unwraps headings from
        disallowed ancestors, then splits styled ``hi`` runs into paragraphs.
        """
        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        self._extract_headings(manipulate, tree)
        self._split_styled_paragraphs(manipulate, tree)

    def run_transform(self):
        """Run the cleanup, the Saxon transform, and copy the result into place.

        A non-zero exit status from Saxon is reported through the debugger;
        processing then continues exactly as it does on success.
        """
        self.pre_cleanup()
        self.gv.mk_dir(self.gv.nlm_folder_path)

        java_command = self._saxon_tei2nlm_args()
        self.debug.print_debug(self, u'Running saxon transform (TEI->NLM)')

        # Pass the arguments as a list without a shell so that paths containing
        # spaces or shell metacharacters reach java intact and uninterpreted.
        exit_status = subprocess.call(java_command, stdin=None, shell=False)
        if exit_status != 0:
            self.debug.print_debug(
                self, u'Saxon transform (TEI->NLM) exited with status {0}'.format(exit_status))

        if self.gv.nlm_temp_file_path != self.gv.nlm_file_path:
            shutil.copy2(self.gv.nlm_temp_file_path, self.gv.nlm_file_path)

    def run(self, process_ref_lists, transform=True):
        """Entry point: optionally transform, then apply the NLM quirks.

        :param process_ref_lists: forwarded to ``run_quirks``.
        :param transform: when true, run the TEI -> NLM transform first.
        """
        if transform:
            self.run_transform()

        # NOTE(review): assumed to sit outside the ``if transform`` branch;
        # the recovered source had no indentation to confirm this.
        if '--purenlm' in self.gv.settings.args and self.gv.settings.args['--purenlm']:
            return

        self.run_quirks(process_ref_lists)