Skip to content

Commit

Permalink
ArchivesSpace: Parse archivesspaceids.csv for filename->resource ID
Browse files Browse the repository at this point in the history
  • Loading branch information
Hwesta committed Jul 14, 2015
1 parent 9f8d6a7 commit 362ecce
Show file tree
Hide file tree
Showing 7 changed files with 282 additions and 0 deletions.
118 changes: 118 additions & 0 deletions src/MCPClient/lib/clientScripts/dip_generation_helper.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,133 @@
#!/usr/bin/env python2
from __future__ import print_function
import argparse
import ast
import csv
import sys

# dashboard
from django.db.models import Q
from main import models

# archivematicaCommon
import archivesspace
import archivematicaFunctions

def create_archivesspace_client():
    """
    Build an ArchivesSpace client from the stored 'ArchivesSpace Config'.

    The connection settings are read from the MicroServiceChoiceReplacementDic
    row whose description is 'ArchivesSpace Config'; its replacementdic field
    is a Python-literal dict keyed by '%host%', '%port%', '%user%' and
    '%passwd%'.

    :return: An archivesspace.ArchivesSpaceClient, or None if authentication
        or the connection failed (a message is printed in that case)
    """
    # TODO use same code as views_as.py?
    stored = models.MicroServiceChoiceReplacementDic.objects.get(description='ArchivesSpace Config')
    settings = ast.literal_eval(stored.replacementdic)

    # Stays None unless the client is constructed successfully
    connection = None
    try:
        connection = archivesspace.ArchivesSpaceClient(
            host=settings['%host%'],
            port=settings['%port%'],
            user=settings['%user%'],
            passwd=settings['%passwd%']
        )
    except archivesspace.AuthenticationError:
        print("Unable to authenticate to ArchivesSpace server using the default user! Check administrative settings.")
    except archivesspace.ConnectionError:
        print("Unable to connect to ArchivesSpace server at the default location! Check administrative settings.")
    return connection

def parse_archivesspaceids_csv(files):
    """
    Parse filename and reference ID from archivesspaceids.csv files

    Each row is expected to hold the filename in the first column and the
    reference ID in the second; any additional columns are ignored.  Rows
    with fewer than two columns (blank lines included) are skipped rather
    than aborting the whole parse with an IndexError.

    When the same filename appears more than once, the last occurrence wins,
    both within a file and across files.

    :param files: List of paths to archivesspaceids.csv files
    :return: Dict with {filename: reference ID}
    """
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3, so choose the open() arguments to match.
    if sys.version_info[0] >= 3:
        mode, extra = 'r', {'newline': '', 'encoding': 'utf-8'}
    else:
        mode, extra = 'rbU', {}

    file_info = {}
    # SIP is last, so takes priority
    for csv_path in files:
        with open(csv_path, mode, **extra) as f:
            for row in csv.reader(f):
                if len(row) < 2:
                    # Blank or incomplete row: nothing usable to record
                    continue
                file_info[row[0]] = row[1]
    return file_info

def parse_archivesspace_ids(sip_path, sip_uuid):
    """
    Pre-populate the matching GUI from a SIP's archivesspaceids.csv.

    For every filename/reference ID pair found in the CSV files, the file is
    looked up in the database for this SIP, the reference ID is resolved to an
    ArchivesSpace resource ID, and an ArchivesSpaceDIPObjectResourcePairing
    row is created.  Entries whose file or resource cannot be resolved are
    reported and skipped.

    :param sip_path: Path to the SIP to check for an archivesspaceids.csv
    :param sip_uuid: UUID of the SIP to auto-populate ArchivesSpace IDs for
    :return: 0 on success, 1 on failure
    """
    # A SIP without any archivesspaceids.csv is not an error
    metadata_csvs = archivematicaFunctions.find_metadata_files(sip_path, 'archivesspaceids.csv')
    if not metadata_csvs:
        print('No archivesspaceids.csv files found, exiting')
        return 0

    ref_ids_by_file = parse_archivesspaceids_csv(metadata_csvs)
    if not ref_ids_by_file:
        print('No information found in archivesspaceids.csv files')
        return 1
    print(ref_ids_by_file)

    # Without a working client nothing can be resolved
    client = create_archivesspace_client()
    if not client:
        return 1

    for filename, ref_id in ref_ids_by_file.items():
        print(filename, ref_id, '%SIPLocation%' + filename)

        # The CSV path may be relative to the transfer or the SIP, with or
        # without the leading objects/ directory
        in_transfer = Q(originallocation='%transferDirectory%' + filename)
        in_transfer_objects = Q(originallocation='%transferDirectory%objects/' + filename)
        in_sip = Q(originallocation='%SIPDirectory%' + filename)
        in_sip_objects = Q(originallocation='%SIPDirectory%objects/' + filename)

        # Get file object (for fileUUID, to see if in DIP)
        try:
            db_file = models.File.objects.get(
                in_transfer | in_transfer_objects | in_sip | in_sip_objects,
                sip_id=sip_uuid
            )
        except models.File.DoesNotExist:
            print(filename, 'not found in database, skipping')
            continue
        except models.File.MultipleObjectsReturned:
            print('Multiple entries for', filename, 'found in database, skipping')
            continue
        print('File:', db_file)

        # Ask ArchivesSpace which resource carries this reference ID
        matches = client.find_by_field('identifier', ref_id)
        try:
            resource_id = matches[0]['id']
        except IndexError:
            print('ArchivesSpace did not return an ID for', ref_id)
            print('Returned', matches)
            continue
        print('Resource ID:', resource_id)

        # Add to ArchivesSpaceDIPObjectResourcePairing
        models.ArchivesSpaceDIPObjectResourcePairing.objects.create(
            dipuuid=sip_uuid,
            fileuuid=db_file.uuid,
            resourceid=resource_id,
        )

    # Check if any files were processed?
    return 0

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Parse metadata for DIP helpers')
    arg_parser.add_argument('--sipUUID', required=True, help='%SIPUUID%')
    arg_parser.add_argument('--sipPath', required=True, help='%SIPDirectory%')
    options = arg_parser.parse_args()

    # Helpers run in order.  Once one returns non-zero, the remaining helpers
    # are not called and that value becomes the exit status.
    dip_helpers = (
        parse_archivesspace_ids,
        # another_dip_helper,
    )
    exit_code = 0
    for helper in dip_helpers:
        exit_code = exit_code or helper(options.sipPath, options.sipUUID)

    sys.exit(exit_code)
26 changes: 26 additions & 0 deletions src/MCPClient/tests/fixtures/archivesspace.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[
{
"pk": "f8749dd2-0923-4b57-a074-45cd92ace56f",
"model": "main.microservicechoicereplacementdic",
"fields": {
"lastmodified": "2015-07-08T17:53:08",
"replaces": null,
"choiceavailableatlink": "a0db8294-f02a-4f49-a557-b1310a715ffc",
"description": "ArchivesSpace Config",
"replacementdic": "{'%port%': '8089', '%object_type%': u'', '%host%': u'localhost', '%xlink_show%': u'none', '%use_statement%': u'none', '%uri_prefix%': u'none', '%xlink_actuate%': u'none', '%access_conditions%': u'', '%use_conditions%': u'', '%restrictions%': u'no', '%passwd%': u'admin', '%user%': u'admin'}"
}
},
{
"pk": "a0db8294-f02a-4f49-a557-b1310a715ffc",
"model": "main.microservicechainlink",
"fields": {
"microservicegroup": "Upload DIP",
"defaultexitmessage": "Failed",
"reloadfilelist": true,
"lastmodified": "2015-07-08T17:53:08",
"defaultnextchainlink": "ff89a530-0540-4625-8884-5a2198dea05a",
"currenttask": "5ded9d05-dd24-484a-a8b2-73ec5d35aa63",
"replaces": null
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
objects/evelyn's photo.jpg,LI00022
Empty file.
25 changes: 25 additions & 0 deletions src/MCPClient/tests/fixtures/test_no_files_in_db.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
interactions:
- request:
body: password=admin
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
Content-Length: ['14']
Content-Type: [application/x-www-form-urlencoded]
User-Agent: [python-requests/2.7.0 CPython/2.7.6 Linux/3.13.0-43-generic]
method: POST
uri: http://localhost:8089/users/admin/login
response:
body: {string: !!python/unicode '{"session":"88373637ab6bd52646d959ad310c1f281fb4ba02073e64c3f4da50b43d67b24a","user":{"lock_version":1159,"username":"admin","name":"Administrator","is_system_user":true,"create_time":"2014-12-05T20:32:17Z","system_mtime":"2015-07-09T23:18:47Z","user_mtime":"2015-07-09T23:18:47Z","jsonmodel_type":"user","groups":[],"is_admin":false,"uri":"/users/1","agent_record":{"ref":"/agents/people/1"},"permissions":{"/repositories/2":["view_repository","update_accession_record","update_resource_record","update_digital_object_record"],"_archivesspace":[]}}}

'}
headers:
cache-control: ['private, must-revalidate, max-age=0']
content-length: ['551']
content-type: [application/json]
date: ['Thu, 09 Jul 2015 23:18:47 GMT']
server: [Jetty(8.1.5.v20120716)]
x-content-type-options: [nosniff]
status: {code: 200, message: OK}
version: 1
52 changes: 52 additions & 0 deletions src/MCPClient/tests/fixtures/test_parse_archivesspace_ids.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
interactions:
- request:
body: password=admin
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
Content-Length: ['14']
Content-Type: [application/x-www-form-urlencoded]
User-Agent: [python-requests/2.7.0 CPython/2.7.6 Linux/3.13.0-43-generic]
method: POST
uri: http://localhost:8089/users/admin/login
response:
body: {string: !!python/unicode '{"session":"4a108561f24f7850cb136cd765405fd563853b39b626e7cf3bfc4a99ef2bab0c","user":{"lock_version":898,"username":"admin","name":"Administrator","is_system_user":true,"create_time":"2014-12-05T20:32:17Z","system_mtime":"2015-07-08T21:38:45Z","user_mtime":"2015-07-08T21:38:45Z","jsonmodel_type":"user","groups":[],"is_admin":false,"uri":"/users/1","agent_record":{"ref":"/agents/people/1"},"permissions":{"/repositories/2":["view_repository","update_accession_record","update_resource_record","update_digital_object_record"],"_archivesspace":[]}}}

'}
headers:
cache-control: ['private, must-revalidate, max-age=0']
content-length: ['550']
content-type: [application/json]
date: ['Wed, 08 Jul 2015 21:38:45 GMT']
server: [Jetty(8.1.5.v20120716)]
x-content-type-options: [nosniff]
status: {code: 200, message: OK}
- request:
body: null
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
User-Agent: [python-requests/2.7.0 CPython/2.7.6 Linux/3.13.0-43-generic]
X-ArchivesSpace-Session: [!!python/unicode '4a108561f24f7850cb136cd765405fd563853b39b626e7cf3bfc4a99ef2bab0c']
method: GET
uri: http://localhost:8089/repositories/2/search?q=identifier%3ALI00022&page=1&page_size=30
response:
body: {string: !!python/unicode '{"first_page":1,"last_page":1,"this_page":1,"offset_first":1,"offset_last":1,"total_hits":1,"results":[{"id":"/repositories/2/resources/1151","title":"Digital
futures : \nstrategies for the information age","primary_type":"resource","types":["resource"],"json":"{\"truncated\":
true}","suppressed":false,"publish":true,"system_generated":false,"repository":"/repositories/2","subjects":["Digital
preservation","Digital libraries"],"agents":["Deegan, Marilyn","Neal-Schuman
Publishers","Library Association Publishing Limited","Tanner, Simon"],"agent_uris":["/agents/people/265","/agents/corporate_entities/90","/agents/corporate_entities/98","/agents/people/266"],"creators":["Deegan,
Marilyn","Neal-Schuman Publishers","Library Association Publishing Limited","Tanner,
Simon"],"created_by":"admin","last_modified_by":"admin","user_mtime":"2014-12-06T15:31:03Z","system_mtime":"2014-12-06T15:31:03Z","create_time":"2014-12-06T15:31:03Z","level":"item","finding_aid_title":"","identifier":"LI00022","language":"eng","restrictions":"false","external_id":["367"],"location_uris":["/locations/7799"],"four_part_id":"LI00022","uri":"/repositories/2/resources/1151","jsonmodel_type":"resource"}],"facets":{"facet_queries":{},"facet_fields":{},"facet_dates":{},"facet_ranges":{}}}
'}
headers:
cache-control: ['private, must-revalidate, max-age=0']
content-length: ['1330']
content-type: [application/json]
date: ['Wed, 08 Jul 2015 21:38:45 GMT']
server: [Jetty(8.1.5.v20120716)]
x-content-type-options: [nosniff]
status: {code: 200, message: OK}
version: 1
60 changes: 60 additions & 0 deletions src/MCPClient/tests/test_dip_generation_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env python2
import os
import sys
import vcr

from django.test import TestCase

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(THIS_DIR, '../lib/clientScripts')))
import dip_generation_helper

from main.models import ArchivesSpaceDIPObjectResourcePairing

class TestParseArchivesSpaceIDs(TestCase):
    """ Tests for dip_generation_helper.parse_archivesspace_ids. """

    fixture_files = ['sip.json', 'files.json', 'archivesspace.json']
    sip_uuid = '4060ee97-9c3f-4822-afaf-ebdf838284c3'
    fixtures = [os.path.join(THIS_DIR, 'fixtures', p) for p in fixture_files]

    def _assert_no_pairings(self):
        """ Check that no ArchivesSpaceDIPObjectResourcePairing rows exist. """
        assert ArchivesSpaceDIPObjectResourcePairing.objects.all().exists() is False

    def test_no_archivesspace_csv(self):
        """ It should do nothing. """
        sip_path = os.path.join(THIS_DIR, 'fixtures', 'emptysip', '')
        self._assert_no_pairings()
        result = dip_generation_helper.parse_archivesspace_ids(sip_path, self.sip_uuid)
        assert result == 0
        self._assert_no_pairings()

    def test_empty_csv(self):
        """ It should do nothing if the CSV is empty. """
        sip_path = os.path.join(THIS_DIR, 'fixtures', 'empty_metadata_files', '')
        self._assert_no_pairings()
        result = dip_generation_helper.parse_archivesspace_ids(sip_path, self.sip_uuid)
        assert result == 1
        self._assert_no_pairings()

    @vcr.use_cassette(os.path.join(THIS_DIR, 'fixtures', 'test_no_files_in_db.yaml'))
    def test_no_files_in_db(self):
        """ It should do nothing if no files are found in the DB. """
        sip_path = os.path.join(THIS_DIR, 'fixtures', 'metadata_csv_sip', '')
        # A SIP UUID that matches no File rows in the fixtures
        missing_sip_uuid = 'dne'
        self._assert_no_pairings()
        result = dip_generation_helper.parse_archivesspace_ids(sip_path, missing_sip_uuid)
        assert result == 0
        self._assert_no_pairings()

    @vcr.use_cassette(os.path.join(THIS_DIR, 'fixtures', 'test_parse_archivesspace_ids.yaml'))
    def test_parse_to_db(self):
        """
        It should create an entry in ArchivesSpaceDIPObjectResourcePairing for each file in archivesspaceids.csv
        It should match the reference ID to a resource ID.
        """
        sip_path = os.path.join(THIS_DIR, 'fixtures', 'archivesspaceid_sip', '')
        self._assert_no_pairings()
        result = dip_generation_helper.parse_archivesspace_ids(sip_path, self.sip_uuid)
        assert result == 0
        pairings = list(ArchivesSpaceDIPObjectResourcePairing.objects.all())
        assert len(pairings) == 1
        pairing = pairings[0]
        assert pairing.dipuuid == self.sip_uuid
        assert pairing.fileuuid == 'ae8d4290-fe52-4954-b72a-0f591bee2e2f'
        assert pairing.resourceid == '/repositories/2/resources/1151'

0 comments on commit 362ecce

Please sign in to comment.