Skip to content

Commit

Permalink
fix for #669 and updated id table
Browse files: browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Sep 19, 2020
1 parent 596cec2 commit 186d238
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3,743 deletions.
7 changes: 7 additions & 0 deletions code/corpus/getIdTable.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -42,6 +42,7 @@ def startElement(self, name, attrs):
self.nb_file += 1
self.doc_id = None
self.current_identifier = {}
self.origin_file = None
if name == "fileDesc":
if attrs.getLength() != 0:
if "xml:id" in attrs:
Expand Down Expand Up @@ -71,6 +72,12 @@ def endElement(self, name):
pmcid = ""
if "PMC" in self.current_identifier:
pmcid = self.current_identifier["PMC"]
if self.origin_file is None and "origin" in self.current_identifier:
self.origin_file = self.current_identifier["origin"]
if self.origin_file is None:
print("Warning: origin file is missing for doc id", self.doc_id)
if self.doc_id is None:
print("Warning: doc id is missing for origin file", self.origin_file)
self.writer.writerow([self.doc_id, self.origin_file, doi, pmid, pmcid])
if name == 'teiCorpus':
self.output_file.close()
Expand Down
Loading

0 comments on commit 186d238

Please sign in to comment.