Skip to content

Commit

Permalink
data model: implement publication statement transformation for BNF
Browse files Browse the repository at this point in the history
* Implements the publication statement transformation for BNF.
* Adds base class for marc dojson transformation.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
  • Loading branch information
rerowep and rerowep committed Sep 18, 2019
1 parent cfefd24 commit 6cec27d
Show file tree
Hide file tree
Showing 4 changed files with 371 additions and 153 deletions.
99 changes: 72 additions & 27 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,51 +17,51 @@

"""Dojson utils."""

import re

import click
from dojson import Overdo, utils


class ReroIlsMarc21Overdo(Overdo):
"""Specialized Overdo.
def remove_trailing_punctuation(
        data,
        punctuation=',',
        spaced_punctuation=':;/-'):
    """Remove one trailing punctuation mark from data.

    Trailing whitespace is stripped first, then at most one punctuation
    mark is removed from the end, then whitespace is stripped again.

    :param data: the string to clean.
    :param punctuation: characters that are removed whether or not they
        are preceded by whitespace.
    :param spaced_punctuation: characters that are removed only when they
        are preceded by one or more whitespace characters (the whitespace
        is removed too).
    :returns: the cleaned string.
    """
    stripped = data.rstrip()
    # Every character is escaped so that regex metacharacters such as
    # '^', ']' or '\' are treated as literal punctuation instead of
    # corrupting the character class.
    alternatives = []
    if punctuation:
        alternatives.append('[{0}]'.format(re.escape(punctuation)))
    if spaced_punctuation:
        alternatives.append(
            r'\s+[{0}]'.format(re.escape(spaced_punctuation)))
    if not alternatives:
        # Nothing to remove: an empty character class would be invalid.
        return stripped
    pattern = '(?:{0})$'.format('|'.join(alternatives))
    return re.sub(pattern, '', stripped).rstrip()


class ReroIlsOverdo(Overdo):
"""Specialized Overdo.
The purpose of this class is to store the blob record in order to
have access to all the record fields during the Overdo processing.
This class also provides record field manipulation functions.
"""

blob_record = None
field_008_data = ''
lang_from_008 = None
date1_from_008 = None
date2_from_008 = None
date_type_from_008 = ''
langs_from_041_a = []
langs_from_041_h = []
alternate_graphic = {}

def __init__(self, bases=None, entry_point_group=None):
    """ReroIlsOverdo init.

    Forwards ``bases`` and ``entry_point_group`` unchanged to the parent
    class initializer.
    """
    # The enclosing class is ReroIlsOverdo, so super() must name it; the
    # stale ReroIlsMarc21Overdo lines left an unclosed call behind.
    super(ReroIlsOverdo, self).__init__(
        bases=bases, entry_point_group=entry_point_group)

def do(self, blob, ignore_missing=True, exception_handlers=None):
"""Translate blob values and instantiate new model instance."""
self.blob_record = blob
self.field_008_data = ''
self.date1_from_008 = None
self.date2_from_008 = None
self.date_type_from_008 = ''
fields_008 = self.get_fields(tag='008')
if fields_008:
self.field_008_data = self.get_control_field_data(
fields_008[0]).replace('\n', '')
self.date1_from_008 = self.field_008_data[7:11]
self.date2_from_008 = self.field_008_data[11:15]
self.date_type_from_008 = self.field_008_data[6]
self.init_lang()
self.init_country()
self.init_alternate_graphic()
result = super(ReroIlsMarc21Overdo, self).do(
result = super(ReroIlsOverdo, self).do(
blob,
ignore_missing=ignore_missing,
exception_handlers=exception_handlers
Expand Down Expand Up @@ -114,6 +114,51 @@ def get_subfields(self, field, code=None):
raise ValueError('data field expected (tag >= 01x)')
return subfields


class ReroIlsMarc21Overdo(ReroIlsOverdo):
"""Specialized Overdo.
This class adds RERO Marc21 properties and functions to the ReroIlsOverdo.
"""

field_008_data = ''
lang_from_008 = None
date1_from_008 = None
date2_from_008 = None
date_type_from_008 = ''
langs_from_041_a = []
langs_from_041_h = []
alternate_graphic = {}

def __init__(self, bases=None, entry_point_group=None):
    """Initialise the instance by delegating to the parent class.

    Both arguments are passed through unchanged as keyword arguments.
    """
    parent = super(ReroIlsMarc21Overdo, self)
    parent.__init__(bases=bases, entry_point_group=entry_point_group)

def do(self, blob, ignore_missing=True, exception_handlers=None):
    """Translate blob values and instantiate new model instance.

    Before delegating to the parent ``do``, the record is stored on the
    instance and the values derived from the first 008 field are reset
    and recomputed, so rules can read them during the transformation.

    :param blob: the record to transform.
    :param ignore_missing: passed through to the parent ``do``.
    :param exception_handlers: passed through to the parent ``do``.
    :returns: whatever the parent ``do`` returns.
    """
    self.blob_record = blob
    # Reset per-record state so values from a previous record never leak.
    self.field_008_data = ''
    self.date1_from_008 = None
    self.date2_from_008 = None
    self.date_type_from_008 = ''
    fields_008 = self.get_fields(tag='008')
    if fields_008:
        self.field_008_data = self.get_control_field_data(
            fields_008[0]).replace('\n', '')
        self.date1_from_008 = self.field_008_data[7:11]
        self.date2_from_008 = self.field_008_data[11:15]
        # Slice rather than index: a truncated 008 field yields '' (the
        # same as the reset value) instead of raising IndexError, which
        # matches the tolerant slicing used for the two dates above.
        self.date_type_from_008 = self.field_008_data[6:7]
    # These helpers are defined outside this view; presumably they fill
    # the language, country and alternate graphic state -- TODO confirm.
    self.init_lang()
    self.init_country()
    self.init_alternate_graphic()
    return super(ReroIlsMarc21Overdo, self).do(
        blob,
        ignore_missing=ignore_missing,
        exception_handlers=exception_handlers
    )

def get_link_data(self, subfields_6_data):
"""Extract link and script data from subfields $6 data."""
link = None
Expand Down
23 changes: 2 additions & 21 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
import requests
from dojson import utils

from rero_ils.dojson.utils import ReroIlsMarc21Overdo
from rero_ils.dojson.utils import ReroIlsMarc21Overdo, \
remove_trailing_punctuation

marc21tojson = ReroIlsMarc21Overdo()

Expand All @@ -36,26 +37,6 @@ def list_of_langs(data):
return lang_codes


def remove_trailing_punctuation(
        data,
        punctuation=',',
        spaced_punctuation=':;/-'):
    """Remove one trailing punctuation mark from data.

    Trailing whitespace is stripped first, then at most one punctuation
    mark is removed from the end, then whitespace is stripped again.

    :param data: the string to clean.
    :param punctuation: characters that are removed whether or not they
        are preceded by whitespace.
    :param spaced_punctuation: characters that are removed only when they
        are preceded by one or more whitespace characters (the whitespace
        is removed too).
    :returns: the cleaned string.
    """
    stripped = data.rstrip()
    # Every character is escaped so that regex metacharacters such as
    # '^', ']' or '\' are treated as literal punctuation instead of
    # corrupting the character class.
    alternatives = []
    if punctuation:
        alternatives.append('[{0}]'.format(re.escape(punctuation)))
    if spaced_punctuation:
        alternatives.append(
            r'\s+[{0}]'.format(re.escape(spaced_punctuation)))
    if not alternatives:
        # Nothing to remove: an empty character class would be invalid.
        return stripped
    pattern = '(?:{0})$'.format('|'.join(alternatives))
    return re.sub(pattern, '', stripped).rstrip()


def get_mef_person_link(id, key, value):
"""Get mef person link."""
# https://mef.test.rero.ch/api/mef/?q=rero.rero_pid:A012327677
Expand Down
164 changes: 105 additions & 59 deletions rero_ils/modules/documents/dojson/contrib/unimarctojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,18 @@

"""rero-ils UNIMARC model definition."""


import re
from json import loads

from dojson import Overdo, utils
# from dojson.utils import force_list
from dojson import utils
from dojson.utils import GroupableOrderedDict, force_list
from pkg_resources import resource_string

unimarctojson = Overdo()
from rero_ils.dojson.utils import ReroIlsOverdo, remove_trailing_punctuation

unimarctojson = ReroIlsOverdo()

# @unimarctojson.over('__order__', '__order__')
# def order(self, key, value):
# """Preserve order of datafields."""
# order = []
# for field in value:
# name = unimarctojson.index.query(field)
# if name:
# name = name[0]
# else:
# name = field
# order.append(name)
#
# return order

@unimarctojson.over('type', 'leader')
def unimarctype(self, key, value):
Expand Down Expand Up @@ -180,50 +169,107 @@ def unimarctoauthor(self, key, value):
return author


@unimarctojson.over('publishers', '^210..')
@unimarctojson.over('provisionActivity', '^21[04]..')
@utils.for_each_value
@utils.ignore_value
def unimarcpublishers_publicationDate(self, key, value):
    """Get publishers and the publication date.

    publisher.place: 210 [$a repetitive]
    publisher.name: 210 [$c repetitive]
    publicationYear: 210 [$d repetitive] (take only the first one)

    Subfields are read in the order given by ``value['__order__']``. A new
    publisher starts whenever a repeated $a follows a subfield other than
    $a. The first $d also sets ``self['publicationYear']`` (first run of
    four digits) and, when the raw text differs from that year or holds
    no year at all, ``self['freeFormedPublicationDate']``.

    :param key: the matched field key (not used here).
    :param value: mapping of subfield codes to values, with ``__order__``.
    :returns: the list of publisher dicts, extended with this field's
        publishers.
    """
    publishers = self.get('publishers', [])
    publisher = {}
    occurrences = {}
    lasttag = '?'
    for tag in value['__order__']:
        index = occurrences.get(tag, 0)
        data = value[tag]
        # Repeated subfields are stored as a tuple; pick this occurrence.
        if isinstance(data, tuple):
            data = data[index]
        if tag == 'a':
            # A place that does not directly follow another place opens a
            # new publisher statement.
            if index > 0 and lasttag != 'a':
                publishers.append(publisher)
                publisher = {}
            publisher.setdefault('place', []).append(data)
        elif tag == 'c':
            publisher.setdefault('name', []).append(data)
        elif tag == 'd' and index == 0:
            # First run of 4 digits; there may be none (e.g. "[s.d.]"), in
            # which case only the free-form date is kept.
            year_match = re.match(r'.*?(\d{4})', data)
            if year_match:
                self['publicationYear'] = int(year_match.group(1))
            if not year_match or data != str(self['publicationYear']):
                self['freeFormedPublicationDate'] = data
        occurrences[tag] = index + 1
        lasttag = tag
    publishers.append(publisher)
    return publishers
def unimarcpublishers_provision_activity_publication(self, key, value):
"""Get provision activity dates.

Builds one provision activity entry from the matched field.

The character at ``key[4]`` selects the behaviour:

* ``'4'``: the first $d value is appended to ``self['copyrightDate']``,
  prefixed with '℗ ' (dropping its first two characters) when it starts
  with 'P', otherwise with '© '.
* any other value: a ``publication`` dict is built with a ``type`` taken
  from ``type_per_ind2``, a ``date`` taken from the first $d, optional
  ``startDate``/``endDate`` and a ``statement`` list made from the $a and
  $c subfields.

Returns ``publication`` or ``None`` when it is empty.

NOTE(review): indentation is not visible in this view, so the exact
nesting of the branches below should be confirmed against the real file.
"""
# Helper building one statement entry for a $a (place) or $c (agent).
# NOTE: the ``index`` argument is accepted but never read in the helper.
def build_place_or_agent_data(code, label, index, add_country):
type_per_code = {
'a': 'bf:Place',
'c': 'bf:Agent'
}
place_or_agent_data = {
'type': type_per_code[code],
'label': [{'value': remove_trailing_punctuation(label)}]
}
if add_country:
# country from 102
field_102 = unimarctojson.get_fields(tag='102')
if field_102:
field_102 = field_102[0]
country_codes = unimarctojson.get_subfields(field_102, 'a')
if country_codes:
# first $a value of the first 102 field, lowercased
place_or_agent_data['country'] = country_codes[0].lower()
return place_or_agent_data

publication = {}
# presumably the second indicator of the matched field -- TODO confirm
ind2 = key[4]
# NOTE: a value of ind2 missing from this table raises KeyError where
# type_per_ind2[ind2] is evaluated below.
type_per_ind2 = {
' ': 'bf:Publication',
'_': 'bf:Publication',
'0': 'bf:Publication',
'1': 'bf:Production',
'2': 'bf:Distribution',
'3': 'bf:Manufacture'
}
if ind2 == '4':
# copyright branch: only self['copyrightDate'] is updated
field_d = value.get('d')
if field_d:
field_d = force_list(field_d)[0]
copyrightDate = self.get('copyrightDate', [])
if field_d[0] == 'P':
copyrightDate.append('℗ ' + field_d[2:])
else:
copyrightDate.append('© ' + field_d)
self['copyrightDate'] = copyrightDate
else:
publication = {
'type': type_per_ind2[ind2],
'statement': [],
}
subfields_d = utils.force_list(value.get('d'))
if subfields_d:
subfield_d = subfields_d[0]
publication['date'] = subfield_d

if ind2 in (' ', '_', '1'):
# startDate: 100, pos. 9-12 endDate: 100, pos. 13-16
field_100 = unimarctojson.get_fields(tag='100')
if field_100:
field_100 = field_100[0]
data = unimarctojson.get_subfields(field_100, 'a')
if data:
# each date is kept only when the slice parses as an integer;
# any failure is silently ignored
try:
publication['startDate'] = str(int(data[0][9:13]))
except Exception:
pass
try:
publication['endDate'] = str(int(data[0][13:17]))
except Exception:
pass
elif ind2 in ('0', '2', '3'):
if subfields_d:
# split the first $d on '-'; each part is kept only when it is
# exactly four digits
dates = subfield_d.split('-')
if dates[0]:
match = re.search(r'(^\d{4}$)', dates[0])
if match:
publication['startDate'] = dates[0]
if len(dates) >= 2:
match = re.search(r'(^\d{4}$)', dates[1])
if match:
publication['endDate'] = dates[1]

statement = []
# iterate every subfield occurrence, including repeated ones
if isinstance(value, GroupableOrderedDict):
items = value.iteritems(repeated=True)
else:
items = utils.iteritems(value)

index = 1
# the country is requested for $a/$c entries until the first $a has
# been processed
add_country = ind2 in (' ', '_', '1')
for blob_key, blob_value in items:
if blob_key in ('a', 'c'):
place_or_agent_data = build_place_or_agent_data(
blob_key, blob_value, index, add_country)
if blob_key == 'a':
add_country = False
statement.append(place_or_agent_data)
if blob_key != '__order__':
index += 1

publication['statement'] = statement
return publication or None


@unimarctojson.over('formats', '^215..')
Expand Down
Loading

0 comments on commit 6cec27d

Please sign in to comment.