-
Notifications
You must be signed in to change notification settings - Fork 4
/
dspace_existing_abstracts.py
26 lines (23 loc) · 1.03 KB
/
dspace_existing_abstracts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from lxml import etree
import os
from os.path import join
# Scan selected XML files for <dao> elements whose href points at a handle
# under http://hdl.handle.net/2027.42 and whose parent or grandparent element
# has a direct <odd> child, then print the XPath location of each such <dao>.

# Directory holding the XML files to inspect.
path = 'C:/Users/djpillen/GitHub/vandura/Real_Masters_all'

# Maps filename -> list of XPath locations (as produced by tree.getpath) of
# the matching <dao> elements found in that file.
the_files = {}

# Only files named here are parsed and reported.
show_me = ['bhl.xml']

# NOTE(review): `ignore` is never read anywhere in the visible script; it is
# kept unchanged in case code outside this view relies on it.
ignore = ['nispodcast.xml','bamdocs.xml','actonh.xml','stewartmary.xml','mullinsr.xml','pollackp.xml','saxj.xml','caen.xml','schoening.xml','shurtleffm.xml','ticecarol.xml','nsfnet.xml','ootbmpm.xml','gonzalesjess.xml']

for filename in os.listdir(path):
    if filename not in show_me:
        continue
    tree = etree.parse(join(path, filename))
    for dao in tree.xpath('//dao'):
        # A <dao> with no href cannot be a handle link. Using .get() with an
        # empty default skips it instead of raising KeyError and aborting
        # the whole scan.
        href = dao.attrib.get('href', '')
        if not href.startswith('http://hdl.handle.net/2027.42'):
            continue
        # The element containing the <dao>, and that element's own parent.
        did = dao.getparent()
        component = did.getparent()
        # Record the <dao> when an <odd> sits directly under either ancestor.
        if did.xpath('./odd') or component.xpath('./odd'):
            the_files.setdefault(filename, []).append(tree.getpath(dao))

for filename in the_files:
    if filename in show_me:
        for loc in the_files[filename]:
            # Parenthesised single-argument form prints identically on
            # Python 2 and is also valid on Python 3.
            print(loc)