Skip to content

Commit

Permalink
data model: implements publication statement transformation for BNF
Browse files Browse the repository at this point in the history
* implement publication statement transformation for BNF

Co-Authored-by: Peter Weber peter.weber@rero.ch
Signed-off-by: Peter Weber <Peter.Weber@rero.ch>
  • Loading branch information
rerowep committed Sep 17, 2019
1 parent 5876f6a commit fae3a6c
Show file tree
Hide file tree
Showing 4 changed files with 366 additions and 151 deletions.
92 changes: 67 additions & 25 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,46 +17,46 @@

"""Dojson utils."""

import re

import click
from dojson import Overdo, utils


class ReroIlsMarc21Overdo(Overdo):
def remove_trailing_punctuation(
        data,
        punctuation=',',
        spaced_punctuation=':;/-'):
    """Remove trailing punctuation from data.

    Trailing whitespace is always stripped, before and after the
    punctuation removal. At most one punctuation character is removed.

    :param data: the string to clean.
    :param punctuation: characters removed when they end the string,
        whether or not they are preceded by a space.
    :param spaced_punctuation: characters removed only when they are
        preceded by one or more whitespace characters.
    :returns: the cleaned string.
    """
    stripped = data.rstrip()
    # Escape the caller-supplied characters so that ``^``, ``]``, ``\`` or
    # ``-`` are taken literally instead of altering the character class.
    alternatives = []
    if punctuation:
        alternatives.append('[{0}]'.format(re.escape(punctuation)))
    if spaced_punctuation:
        alternatives.append(
            r'\s+[{0}]'.format(re.escape(spaced_punctuation)))
    if not alternatives:
        # An empty character class is not a valid pattern; nothing to remove.
        return stripped
    pattern = '(?:{0})$'.format('|'.join(alternatives))
    return re.sub(pattern, '', stripped).rstrip()


class ReroIlsOverdo(Overdo):
"""Specialzed Overdo."""

blob_record = None
field_008_data = ''
lang_from_008 = None
date1_from_008 = None
date2_from_008 = None
date_type_from_008 = ''
langs_from_041_a = []
langs_from_041_h = []
alternate_graphic = {}

def __init__(self, bases=None, entry_point_group=None):
"""Init."""
super(ReroIlsMarc21Overdo, self).__init__(
super(ReroIlsOverdo, self).__init__(
bases=bases, entry_point_group=entry_point_group)

def do(self, blob, ignore_missing=True, exception_handlers=None):
"""Translate blob values and instantiate new model instance."""
self.blob_record = blob
self.field_008_data = ''
self.date1_from_008 = None
self.date2_from_008 = None
self.date_type_from_008 = ''
fields_008 = self.get_fields(tag='008')
if fields_008:
self.field_008_data = self.get_control_field_data(
fields_008[0]).replace('\n', '')
self.date1_from_008 = self.field_008_data[7:11]
self.date2_from_008 = self.field_008_data[11:15]
self.date_type_from_008 = self.field_008_data[6]
self.init_lang()
self.init_country()
self.init_alternate_graphic()
result = super(ReroIlsMarc21Overdo, self).do(
result = super(ReroIlsOverdo, self).do(
blob,
ignore_missing=ignore_missing,
exception_handlers=exception_handlers
Expand Down Expand Up @@ -109,6 +109,48 @@ def get_subfields(self, field, code=None):
raise ValueError('data field expected (tag >= 01x)')
return subfields


class ReroIlsMarc21Overdo(ReroIlsOverdo):
"""Specialzed Overdo."""

field_008_data = ''
lang_from_008 = None
date1_from_008 = None
date2_from_008 = None
date_type_from_008 = ''
langs_from_041_a = []
langs_from_041_h = []
alternate_graphic = {}

def __init__(self, bases=None, entry_point_group=None):
    """Initialise the instance by delegating to the parent class.

    :param bases: forwarded unchanged to the parent initialiser.
    :param entry_point_group: forwarded unchanged to the parent initialiser.
    """
    parent = super(ReroIlsMarc21Overdo, self)
    parent.__init__(bases=bases, entry_point_group=entry_point_group)

def do(self, blob, ignore_missing=True, exception_handlers=None):
    """Translate blob values and instantiate new model instance.

    Before delegating to the parent ``do``, the record is stored on the
    instance and the values read from control field 008 are refreshed, then
    the language, country and alternate graphic helpers are run.

    :param blob: the record to transform.
    :param ignore_missing: forwarded unchanged to the parent ``do``.
    :param exception_handlers: forwarded unchanged to the parent ``do``.
    :returns: the result of the parent ``do``.
    """
    self.blob_record = blob
    # Reset the per-record state so nothing leaks from a previous record.
    self.field_008_data = ''
    self.date1_from_008 = None
    self.date2_from_008 = None
    self.date_type_from_008 = ''
    fields_008 = self.get_fields(tag='008')
    if fields_008:
        self.field_008_data = self.get_control_field_data(
            fields_008[0]).replace('\n', '')
        self.date1_from_008 = self.field_008_data[7:11]
        self.date2_from_008 = self.field_008_data[11:15]
        # Slice instead of index: a truncated 008 field yields '' (the
        # default value) rather than raising IndexError.
        self.date_type_from_008 = self.field_008_data[6:7]
    self.init_lang()
    self.init_country()
    self.init_alternate_graphic()
    return super(ReroIlsMarc21Overdo, self).do(
        blob,
        ignore_missing=ignore_missing,
        exception_handlers=exception_handlers
    )

def get_link_data(self, subfields_6_data):
"""Extract link and script data from subfields $6 data."""
link = None
Expand Down
23 changes: 2 additions & 21 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
import requests
from dojson import utils

from rero_ils.dojson.utils import ReroIlsMarc21Overdo
from rero_ils.dojson.utils import ReroIlsMarc21Overdo, \
remove_trailing_punctuation

marc21tojson = ReroIlsMarc21Overdo()

Expand All @@ -36,26 +37,6 @@ def list_of_langs(data):
return lang_codes


def remove_trailing_punctuation(
        data,
        punctuation=',',
        spaced_punctuation=':;/-'):
    """Remove trailing punctuation from data.

    Trailing whitespace is always stripped, before and after the
    punctuation removal. At most one punctuation character is removed.

    :param data: the string to clean.
    :param punctuation: characters removed when they end the string,
        whether or not they are preceded by a space.
    :param spaced_punctuation: characters removed only when they are
        preceded by one or more whitespace characters.
    :returns: the cleaned string.
    """
    stripped = data.rstrip()
    # Escape the caller-supplied characters so that ``^``, ``]``, ``\`` or
    # ``-`` are taken literally instead of altering the character class.
    alternatives = []
    if punctuation:
        alternatives.append('[{0}]'.format(re.escape(punctuation)))
    if spaced_punctuation:
        alternatives.append(
            r'\s+[{0}]'.format(re.escape(spaced_punctuation)))
    if not alternatives:
        # An empty character class is not a valid pattern; nothing to remove.
        return stripped
    pattern = '(?:{0})$'.format('|'.join(alternatives))
    return re.sub(pattern, '', stripped).rstrip()


def get_mef_person_link(id, key, value):
"""Get mef person link."""
# https://mef.test.rero.ch/api/mef/?q=rero.rero_pid:A012327677
Expand Down
164 changes: 105 additions & 59 deletions rero_ils/modules/documents/dojson/contrib/unimarctojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,18 @@

"""rero-ils UNIMARC model definition."""


import re
from json import loads

from dojson import Overdo, utils
# from dojson.utils import force_list
from dojson import utils
from dojson.utils import GroupableOrderedDict, force_list
from pkg_resources import resource_string

unimarctojson = Overdo()
from rero_ils.dojson.utils import ReroIlsOverdo, remove_trailing_punctuation

unimarctojson = ReroIlsOverdo()

# @unimarctojson.over('__order__', '__order__')
# def order(self, key, value):
# """Preserve order of datafields."""
# order = []
# for field in value:
# name = unimarctojson.index.query(field)
# if name:
# name = name[0]
# else:
# name = field
# order.append(name)
#
# return order

@unimarctojson.over('type', 'leader')
def unimarctype(self, key, value):
Expand Down Expand Up @@ -180,50 +169,107 @@ def unimarctoauthor(self, key, value):
return author


@unimarctojson.over('publishers', '^210..')
@unimarctojson.over('provisionActivity', '^21[04]..')
@utils.for_each_value
@utils.ignore_value
def unimarcpublishers_publicationDate(self, key, value):
    """Get the publishers and the publication date.

    publisher.place: $a (repeatable)
    publisher.name: $c (repeatable)
    publicationYear: first 4-digit run of the first $d
    freeFormedPublicationDate: the first $d, when it is not just the year

    A new publisher entry is started each time a $a follows a subfield
    other than $a.

    :param self: the record being built; the date keys are written to it.
    :param key: the matched field key (not used).
    :param value: the field subfields, with their sequence in '__order__'.
    :returns: the list of publishers.
    """
    publishers = self.get('publishers', [])
    publisher = {}
    indexes = {}
    lasttag = '?'
    for tag in value['__order__']:
        index = indexes.get(tag, 0)
        data = value[tag]
        # Repeated subfields are stored as a tuple: take the current one.
        if isinstance(data, tuple):
            data = data[index]
        if tag == 'a':
            if index > 0 and lasttag != 'a':
                publishers.append(publisher)
                publisher = {}
            publisher.setdefault('place', []).append(data)
        elif tag == 'c':
            publisher.setdefault('name', []).append(data)
        elif tag == 'd' and index == 0:
            year = re.match(r'.*?(\d{4})', data)
            if year is None:
                # No 4-digit year (e.g. '[s.d.]'): keep the free form only
                # instead of failing on the missing match.
                self['freeFormedPublicationDate'] = data
            else:
                self['publicationYear'] = int(year.group(1))
                # create free form if different
                if data != str(self['publicationYear']):
                    self['freeFormedPublicationDate'] = data
        indexes[tag] = index + 1
        lasttag = tag
    publishers.append(publisher)
    return publishers
def unimarcpublishers_provision_activity_publication(self, key, value):
"""Get provision activity data: type, dates and place/agent statement.

When the second indicator is '4' only a copyright date is stored on the
record; otherwise a publication dict is built from the subfields.
Returns the publication dict, or None when it is empty.
"""
# Helper building one statement entry: a place for $a, an agent for $c.
# NOTE(review): the `index` argument is accepted but never read.
def build_place_or_agent_data(code, label, index, add_country):
type_per_code = {
'a': 'bf:Place',
'c': 'bf:Agent'
}
place_or_agent_data = {
'type': type_per_code[code],
'label': [{'value': remove_trailing_punctuation(label)}]
}
if add_country:
# country from 102
# Uses the first $a of the first 102 field, lower-cased.
field_102 = unimarctojson.get_fields(tag='102')
if field_102:
field_102 = field_102[0]
country_codes = unimarctojson.get_subfields(field_102, 'a')
if country_codes:
place_or_agent_data['country'] = country_codes[0].lower()
return place_or_agent_data

publication = {}
# Fifth character of the key; presumably the second indicator of the
# field, as the variable name suggests.
ind2 = key[4]
# NOTE(review): an indicator value missing from this table (other than
# '4') raises KeyError where the type is looked up below.
type_per_ind2 = {
' ': 'bf:Publication',
'_': 'bf:Publication',
'0': 'bf:Publication',
'1': 'bf:Production',
'2': 'bf:Distribution',
'3': 'bf:Manufacture'
}
if ind2 == '4':
# Only the first $d is used and appended to the record's copyrightDate.
field_d = value.get('d')
if field_d:
field_d = force_list(field_d)[0]
copyrightDate = self.get('copyrightDate', [])
# A leading 'P' gives the '℗ ' prefix and drops the first two
# characters of $d; any other value is prefixed with '© '.
if field_d[0] == 'P':
copyrightDate.append('℗ ' + field_d[2:])
else:
copyrightDate.append('© ' + field_d)
self['copyrightDate'] = copyrightDate
else:
publication = {
'type': type_per_ind2[ind2],
'statement': [],
}
# The first $d, when present, is kept verbatim as the date.
subfields_d = utils.force_list(value.get('d'))
if subfields_d:
subfield_d = subfields_d[0]
publication['date'] = subfield_d

if ind2 in (' ', '_', '1'):
# startDate: 100, pos. 9-12 endDate: 100, pos. 13-16
field_100 = unimarctojson.get_fields(tag='100')
if field_100:
field_100 = field_100[0]
data = unimarctojson.get_subfields(field_100, 'a')
if data:
# int() is used as a numeric check: a slice that is not a number is
# silently skipped and the corresponding date is left unset.
try:
publication['startDate'] = str(int(data[0][9:13]))
except Exception:
pass
try:
publication['endDate'] = str(int(data[0][13:17]))
except Exception:
pass
elif ind2 in ('0', '2', '3'):
# Dates are taken from $d itself, split on '-'; a part is accepted only
# when it is exactly four digits.
if subfields_d:
dates = subfield_d.split('-')
if dates[0]:
match = re.search(r'(^\d{4}$)', dates[0])
if match:
publication['startDate'] = dates[0]
if len(dates) >= 2:
match = re.search(r'(^\d{4}$)', dates[1])
if match:
publication['endDate'] = dates[1]

statement = []
# A GroupableOrderedDict is iterated with repeated=True so that repeated
# subfields are yielded one by one.
if isinstance(value, GroupableOrderedDict):
items = value.iteritems(repeated=True)
else:
items = utils.iteritems(value)

index = 1
add_country = ind2 in (' ', '_', '1')
# Each $a / $c subfield becomes one statement entry. add_country is
# cleared after the first $a, so later entries carry no country.
# NOTE(review): a $c met before the first $a would also get the country.
for blob_key, blob_value in items:
if blob_key in ('a', 'c'):
place_or_agent_data = build_place_or_agent_data(
blob_key, blob_value, index, add_country)
if blob_key == 'a':
add_country = False
statement.append(place_or_agent_data)
if blob_key != '__order__':
index += 1

publication['statement'] = statement
return publication or None


@unimarctojson.over('formats', '^215..')
Expand Down
Loading

0 comments on commit fae3a6c

Please sign in to comment.