def remove_trailing_punctuation(
        data,
        punctuation=',',
        spaced_punctuation=':;/-'):
    """Remove trailing punctuation from data.

    Trailing whitespace is always stripped first. At most one trailing
    punctuation character is then removed, and the whitespace left in
    front of it is stripped as well.

    :param data: the string to clean.
    :param punctuation: the punctuation characters to be removed
        (preceded by a space or not). An empty string disables this rule.
    :param spaced_punctuation: the punctuation characters needing one or
        more preceding space(s) in order to be removed. An empty string
        disables this rule.
    :return: the cleaned string.
    """
    stripped = data.rstrip()

    # Every character is escaped so that it is taken literally inside the
    # character class: an unescaped '-', '^', ']' or '\\' would otherwise
    # build a range, negate the class or break the pattern.
    alternatives = []
    if punctuation:
        alternatives.append('[{0}]'.format(re.escape(punctuation)))
    if spaced_punctuation:
        alternatives.append(
            r'\s+[{0}]'.format(re.escape(spaced_punctuation)))

    # An empty character class ('[]') is not valid regex syntax, so a rule
    # with no characters is left out instead of being interpolated.
    if not alternatives:
        return stripped

    pattern = '(?:{0})$'.format('|'.join(alternatives))
    return re.sub(pattern, '', stripped).rstrip()
def do(self, blob, ignore_missing=True, exception_handlers=None):
    """Translate blob values and instantiate new model instance.

    Before delegating to the parent ``do``, the record is stored on the
    instance and the values derived from control field 008 are reset and
    recomputed, then the language, country and alternate graphic helpers
    are run.

    :param blob: the record to translate.
    :param ignore_missing: passed through to the parent ``do``.
    :param exception_handlers: passed through to the parent ``do``.
    :return: the result of the parent ``do``.
    """
    # The record must be available before the parent ``do`` runs because
    # ``get_fields`` is called right below.
    self.blob_record = blob

    # Reset the per-record state so nothing leaks from a previous record.
    self.field_008_data = ''
    self.date1_from_008 = None
    self.date2_from_008 = None
    self.date_type_from_008 = ''

    fields_008 = self.get_fields(tag='008')
    if fields_008:
        self.field_008_data = self.get_control_field_data(
            fields_008[0]).replace('\n', '')
        self.date1_from_008 = self.field_008_data[7:11]
        self.date2_from_008 = self.field_008_data[11:15]
        # A one-character slice instead of an index: a truncated 008 field
        # yields '' (the reset value) rather than raising IndexError.
        self.date_type_from_008 = self.field_008_data[6:7]

    self.init_lang()
    self.init_country()
    self.init_alternate_graphic()

    return super(ReroIlsMarc21Overdo, self).do(
        blob,
        ignore_missing=ignore_missing,
        exception_handlers=exception_handlers
    )
ReroIlsMarc21Overdo +from rero_ils.dojson.utils import ReroIlsMarc21Overdo, \ + remove_trailing_punctuation marc21tojson = ReroIlsMarc21Overdo() @@ -36,26 +37,6 @@ def list_of_langs(data): return lang_codes -def remove_trailing_punctuation( - data, - punctuation=',', - spaced_punctuation=':;/-'): - """Remove trailing punctuation from data. - - The punctuation parameter list the - punctuation characters to be removed - (preceded by a space or not). - - The spaced_punctuation parameter list the - punctuation characters needing one or more preceding space(s) - in order to be removed. - """ - return re.sub( - r'([{0}]|\s+[{1}])$'.format(punctuation, spaced_punctuation), - '', - data.rstrip()).rstrip() - - def get_mef_person_link(id, key, value): """Get mef person link.""" # https://mef.test.rero.ch/api/mef/?q=rero.rero_pid:A012327677 diff --git a/rero_ils/modules/documents/dojson/contrib/unimarctojson/model.py b/rero_ils/modules/documents/dojson/contrib/unimarctojson/model.py index 47cbe8a4ee..d8484f449c 100644 --- a/rero_ils/modules/documents/dojson/contrib/unimarctojson/model.py +++ b/rero_ils/modules/documents/dojson/contrib/unimarctojson/model.py @@ -17,29 +17,18 @@ """rero-ils UNIMARC model definition.""" + import re from json import loads -from dojson import Overdo, utils -# from dojson.utils import force_list +from dojson import utils +from dojson.utils import GroupableOrderedDict, force_list from pkg_resources import resource_string -unimarctojson = Overdo() +from rero_ils.dojson.utils import ReroIlsOverdo, remove_trailing_punctuation +unimarctojson = ReroIlsOverdo() -# @unimarctojson.over('__order__', '__order__') -# def order(self, key, value): -# """Preserve order of datafields.""" -# order = [] -# for field in value: -# name = unimarctojson.index.query(field) -# if name: -# name = name[0] -# else: -# name = field -# order.append(name) -# -# return order @unimarctojson.over('type', 'leader') def unimarctype(self, key, value): @@ -180,50 +169,107 @@ def 
@unimarctojson.over('provisionActivity', '^21[04]..')
@utils.for_each_value
@utils.ignore_value
def unimarcpublishers_provision_activity_publication(self, key, value):
    """Get provision activity data, or a copyright date, from 210/214.

    When the second indicator is '4' the first $d is appended to
    ``self['copyrightDate']`` (prefixed with '℗ ' when it starts with
    'P', with '© ' otherwise) and nothing is returned for
    ``provisionActivity``.

    Otherwise a publication dict is returned with:

    * ``type``: derived from the second indicator;
    * ``date``: the first $d, when present;
    * ``startDate`` / ``endDate``: taken from field 100 $a (positions
      9-12 and 13-16) for indicators ' ', '_' and '1', or parsed from a
      ``YYYY-YYYY`` $d for indicators '0', '2' and '3';
    * ``statement``: one entry per $a (place) and $c (agent), in field
      order.

    :param key: the matched field key; its fifth character is read as
        the second indicator.
    :param value: the subfields of the field.
    :return: the publication dict, or None when there is nothing to add.
    """
    def build_place_or_agent_data(code, label, add_country):
        """Build one statement entry for subfield ``code``."""
        type_per_code = {
            'a': 'bf:Place',
            'c': 'bf:Agent'
        }
        place_or_agent_data = {
            'type': type_per_code[code],
            'label': [{'value': remove_trailing_punctuation(label)}]
        }
        if add_country:
            # country from 102
            field_102 = unimarctojson.get_fields(tag='102')
            if field_102:
                country_codes = unimarctojson.get_subfields(
                    field_102[0], 'a')
                if country_codes:
                    place_or_agent_data['country'] = \
                        country_codes[0].lower()
        return place_or_agent_data

    ind2 = key[4]
    type_per_ind2 = {
        ' ': 'bf:Publication',
        '_': 'bf:Publication',
        '0': 'bf:Publication',
        '1': 'bf:Production',
        '2': 'bf:Distribution',
        '3': 'bf:Manufacture'
    }
    subfields_d = utils.force_list(value.get('d'))
    first_d = subfields_d[0] if subfields_d else None

    if ind2 == '4':
        # Copyright notice: only self['copyrightDate'] is fed.
        if first_d:
            copyright_date = self.get('copyrightDate', [])
            # startswith() instead of [0]: no IndexError on odd input.
            if first_d.startswith('P'):
                copyright_date.append('℗ ' + first_d[2:])
            else:
                copyright_date.append('© ' + first_d)
            self['copyrightDate'] = copyright_date
        return None

    # NOTE(review): an indicator outside the table raises KeyError here,
    # as before; confirm whether a fallback type is wanted.
    publication = {
        'type': type_per_ind2[ind2],
        'statement': [],
    }
    if subfields_d:
        publication['date'] = first_d

    uses_field_100 = ind2 in (' ', '_', '1')
    if uses_field_100:
        # startDate: 100, pos. 9-12 endDate: 100, pos. 13-16
        field_100 = unimarctojson.get_fields(tag='100')
        if field_100:
            data = unimarctojson.get_subfields(field_100[0], 'a')
            if data:
                for name, start, stop in (('startDate', 9, 13),
                                          ('endDate', 13, 17)):
                    try:
                        publication[name] = str(int(data[0][start:stop]))
                    except (ValueError, TypeError):
                        # Not a number (e.g. '????'): leave it unset.
                        pass
    elif subfields_d:
        # Indicators '0', '2', '3': the dates come from $d itself.
        dates = first_d.split('-')
        if re.match(r'\d{4}$', dates[0]):
            publication['startDate'] = dates[0]
        if len(dates) >= 2 and re.match(r'\d{4}$', dates[1]):
            publication['endDate'] = dates[1]

    if isinstance(value, GroupableOrderedDict):
        items = value.iteritems(repeated=True)
    else:
        items = utils.iteritems(value)

    statement = []
    # The country is attached to the first place only. An agent ($c)
    # listed before the first place must not receive it.
    country_pending = uses_field_100
    for blob_key, blob_value in items:
        if blob_key not in ('a', 'c'):
            continue
        is_place = blob_key == 'a'
        statement.append(build_place_or_agent_data(
            blob_key, blob_value, is_place and country_pending))
        if is_place:
            country_pending = False

    publication['statement'] = statement
    return publication
@unimarctojson.over('formats', '^215..') diff --git a/tests/unit/test_documents_dojson_unimarc.py b/tests/unit/test_documents_dojson_unimarc.py index fe7e5c2e3f..65b58cce43 100644 --- a/tests/unit/test_documents_dojson_unimarc.py +++ b/tests/unit/test_documents_dojson_unimarc.py @@ -26,9 +26,9 @@ # type: leader -def test_unimarctotype(): +def test_unimarc_to_type(): """ - Test dojson marc21_to_type. + Test dojson unimarc_to_type. Books: LDR/6-7: am Journals: LDR/6-7: as @@ -155,7 +155,7 @@ def test_unimarctotitle(): # titleProper: [500$a repetitive] def test_unimarctotitlesProper(): - """Test dojson marc21titlesProper.""" + """Test dojson unimarc_titlesProper.""" unimarcxml = """ @@ -184,8 +184,8 @@ def test_unimarctotitlesProper(): # languages: 101 [$a] -def test_marc21languages(): - """Test dojson marc21languages.""" +def test_unimarc_languages(): + """Test dojson unimarc_languages.""" unimarcxml = """ @@ -308,14 +308,14 @@ def test_unimarctoauthors(): ] -# publishers.name: 210 [$c repetitive] -# publishers.place: 210 [$a repetitive] -# publicationDate: 210 [$d repetitive] (take only the first one) -def test_marc21publishers_publicationDate(): +def test_unimarc_publishers_provision_activity(): """Test dojson publishers publicationDate.""" unimarcxml = """ + + xxxxxxxxx2015????xxxxxxxxx + Lausanne Payot @@ -325,37 +325,75 @@ def test_marc21publishers_publicationDate(): """ unimarcjson = create_record(unimarcxml) data = unimarctojson.do(unimarcjson) - assert data.get('publishers') == [ - { - 'place': ['Lausanne'], - 'name': ['Payot'], - } - ] - assert data.get('publicationYear') == 2015 + assert data.get('provisionActivity') == [{ + 'type': 'bf:Publication', + 'statement': [ + { + 'label': [ + {'value': 'Lausanne'} + ], + 'type': 'bf:Place' + }, + { + 'label': [ + {'value': 'Payot'} + ], + 'type': 'bf:Agent' + }, + ], + 'startDate': '2015', + 'date': '2015' + }] unimarcxml = """ + + xxxxxxxxx19691999xxxxxxxxx + + + FR + - Paris - Lausanne - Payot - 1920 + [Paris] : 
+ Desclée de Brouwer [puis] + Etudes augustiniennes, + [1969-1999] """ unimarcjson = create_record(unimarcxml) data = unimarctojson.do(unimarcjson) - assert data.get('publishers') == [ - { - 'place': ['Paris', 'Lausanne'], - 'name': ['Payot'], - } - ] - assert data.get('publicationYear') == 1920 + assert data.get('provisionActivity') == [{ + 'type': 'bf:Publication', + 'statement': [ + { + 'country': 'fr', + 'label': [ + {'value': '[Paris]'} + ], + 'type': 'bf:Place' + }, + { + 'label': [ + {'value': 'Desclée de Brouwer [puis]'} + ], + 'type': 'bf:Agent' + }, + { + 'label': [ + {'value': 'Etudes augustiniennes'} + ], + 'type': 'bf:Agent' + } + ], + 'startDate': '1969', + 'endDate': '1999', + 'date': '[1969-1999]' + }] unimarcxml = """ - + Paris Champion Genève @@ -366,24 +404,132 @@ def test_marc21publishers_publicationDate(): """ unimarcjson = create_record(unimarcxml) data = unimarctojson.do(unimarcjson) - assert data.get('publishers') == [ - { - 'place': ['Paris'], - 'name': ['Champion'] - }, - { - 'place': ['Genève'], - 'name': ['Droz'] - } - ] - assert data.get('freeFormedPublicationDate') == '1912-1955' - assert data.get('publicationYear') == 1912 + assert data.get('provisionActivity') == [{ + 'type': 'bf:Publication', + 'statement': [ + { + 'label': [ + {'value': 'Paris'} + ], + 'type': 'bf:Place' + }, + { + 'label': [ + {'value': 'Champion'} + ], + 'type': 'bf:Agent' + }, + { + 'label': [ + {'value': 'Genève'} + ], + 'type': 'bf:Place' + }, + { + 'label': [ + {'value': 'Droz'} + ], + 'type': 'bf:Agent' + } + ], + 'startDate': '1912', + 'endDate': '1955', + 'date': '1912-1955' + }] + + unimarcxml = """ + + + Lausanne + + + """ + unimarcjson = create_record(unimarcxml) + data = unimarctojson.do(unimarcjson) + assert data.get('provisionActivity') == [{ + 'type': 'bf:Production', + 'statement': [ + { + 'label': [ + {'value': 'Lausanne'} + ], + 'type': 'bf:Place' + } + ], + }] + + unimarcxml = """ + + + Lausanne + + + """ + unimarcjson = 
create_record(unimarcxml) + data = unimarctojson.do(unimarcjson) + assert data.get('provisionActivity') == [{ + 'type': 'bf:Distribution', + 'statement': [ + { + 'label': [ + {'value': 'Lausanne'} + ], + 'type': 'bf:Place' + } + ], + }] + + unimarcxml = """ + + + Lausanne + + + """ + unimarcjson = create_record(unimarcxml) + data = unimarctojson.do(unimarcjson) + assert data.get('provisionActivity') == [{ + 'type': 'bf:Manufacture', + 'statement': [ + { + 'label': [ + {'value': 'Lausanne'} + ], + 'type': 'bf:Place' + } + ], + }] + + +def test_unimarc_copyright_date(): + """Test copyright date.""" + unimarcxml = """ + + + 1919 + + + """ + unimarcjson = create_record(unimarcxml) + data = unimarctojson.do(unimarcjson) + assert data.get('copyrightDate') == ['© 1919'] + + unimarcxml = """ + + + P 1919 + + + """ + unimarcjson = create_record(unimarcxml) + data = unimarctojson.do(unimarcjson) + assert data.get('copyrightDate') == ['℗ 1919'] # extent: 215$a (the first one if many) # otherMaterialCharacteristics: 215$c (the first one if many) # formats: 215 [$d repetitive] -def test_marc21description(): +def test_unimarc_description(): """Test dojson extent, otherMaterialCharacteristics, formats.""" unimarcxml = """ @@ -425,7 +571,7 @@ def test_marc21description(): # series.name: [225$a repetitive] # series.number: [225$v repetitive] -def test_marc21series(): +def test_unimarc_series(): """Test dojson series.""" unimarcxml = """ @@ -454,7 +600,7 @@ def test_marc21series(): # abstract: [330$a repetitive] -def test_marc21abstract(): +def test_unimarc_abstract(): """Test dojson abstract.""" unimarcxml = """ @@ -470,7 +616,7 @@ def test_marc21abstract(): # identifiers:isbn: 010$a -def test_marc21identifiers(): +def test_unimarc_identifiers(): """Test dojson identifiers.""" unimarcxml = """ @@ -491,7 +637,7 @@ def test_marc21identifiers(): # notes: [300$a repetitive] -def test_marc21notes(): +def test_unimarc_notes(): """Test dojson notes.""" unimarcxml = """ @@ -522,7 +668,7 @@ 
def test_marc21notes(): # subjects: 600..617 $a,$b,$c,$d,$f # [duplicates could exist between several vocabularies, # if possible deduplicate] -def test_marc21subjects(): +def test_unimarc_subjects(): """Test dojson subjects.""" unimarcxml = """