Skip to content

Commit

Permalink
documents: store source of authority identifiers
Browse files Browse the repository at this point in the history
* Closes rero#1885.
* Improves json schema reference definitions.
* Closes rero#1873, last bullet point of rero#1873 (comment).
* Adds identifiedBy values to subjects, genreForm and contribution if possible.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
  • Loading branch information
rerowep and rerowep committed May 19, 2021
1 parent efca4ad commit 90095aa
Show file tree
Hide file tree
Showing 20 changed files with 3,535 additions and 6,139 deletions.
3,891 changes: 865 additions & 3,026 deletions data/documents_big.json

Large diffs are not rendered by default.

2,596 changes: 1,298 additions & 1,298 deletions data/documents_big.xml

Large diffs are not rendered by default.

1,252 changes: 304 additions & 948 deletions data/documents_small.json

Large diffs are not rendered by default.

718 changes: 359 additions & 359 deletions data/documents_small.xml

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@
}


# Splits a string of the form '(prefix)rest': group 1 is the text between the
# parentheses and group 2 is everything after the closing parenthesis.
# Both groups are greedy, so group 1 extends up to the last ')' in the string.
re_identified = re.compile(r'\((.*)\)(.*)')


def error_print(*args):
"""Error printing to sdtout."""
msg = ''
Expand Down Expand Up @@ -1778,3 +1781,32 @@ def build_responsibility_data(responsibility_data):
index += 1
responsibilities.append(out_data)
return responsibilities


def build_identifier(data):
    """Build identifiedBy for document_identifier-v0.0.1.json from $0.

    Only the first ``$0`` value is used. It is expected to look like
    ``(SOURCE)value``. A source found in the known-sources table (compared
    upper-cased) becomes the identifier ``type``; any other source yields a
    ``bf:Local`` identifier that keeps the source string as written.

    A ``$0`` value that does not match the expected pattern is reported
    with a warning and ignored.

    :param data: data to build the identifiedBy from.
    :returns: identifiedBy from $0 or None.
    """
    sources = {
        'RERO': 'RERO',
        'RERO-RAMEAU': 'RERO-RAMEAU',
        'IDREF': 'IdRef',
        'GND': 'GND'
    }
    data_0 = utils.force_list(data.get('0'))
    if not data_0:
        return None

    match = re_identified.match(data_0[0])
    if match is None:
        # A failed match returns None; calling .group() on it raises
        # AttributeError, which the former `except IndexError` never caught.
        click.echo(f'WARNING creating identifier: {data_0}')
        return None

    source = match.group(1)
    result = {'value': match.group(2)}
    identifier_type = sources.get(source.upper())
    if identifier_type:
        result['type'] = identifier_type
    else:
        # Unknown authority: keep the source as written in the record.
        result['type'] = 'bf:Local'
        result['source'] = source
    return result
141 changes: 79 additions & 62 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@
from dojson.utils import GroupableOrderedDict

from rero_ils.dojson.utils import ReroIlsMarc21Overdo, TitlePartList, \
add_note, build_responsibility_data, build_string_from_subfields, \
error_print, extract_subtitle_and_parallel_titles_from_field_245_b, \
get_field_items, get_field_link_data, make_year, not_repetitive, \
add_note, build_identifier, build_responsibility_data, \
build_string_from_subfields, error_print, \
extract_subtitle_and_parallel_titles_from_field_245_b, get_field_items, \
get_field_link_data, make_year, not_repetitive, \
remove_trailing_punctuation
from rero_ils.modules.utils import requests_retry_session

_ISSUANCE_MAIN_TYPE_PER_BIB_LEVEL = {
'a': 'rdami:1001',
Expand Down Expand Up @@ -207,8 +209,10 @@
'filmage_ch': re.compile(r'^from the age of')
}

_IDREF_REF_REGEX = re.compile(r'^\(IDREF\)(.*)?')

# Match a leading '(IdRef)' / '(RERO)' source prefix, ignoring case, and
# capture whatever follows it as group 1.
# The flag is passed explicitly: an inline '(?i)' placed after '^' is a
# global flag not at the start of the expression, which is deprecated since
# Python 3.6 and rejected with re.error from Python 3.11 on.
_IDREF_REF_REGEX = re.compile(r'^\(IdRef\)(.*)?', re.IGNORECASE)
_RERO_REF_REGEX = re.compile(r'^\(RERO\)(.*)?', re.IGNORECASE)
# MARC tag prefixes (first three characters of the field key) for which a
# contribution link lookup is attempted.
_CONTRIBUTION_TAGS = ['100', '600', '610', '611', '630', '650', '651',
                      '655', '700', '710', '711']

marc21 = ReroIlsMarc21Overdo()

Expand All @@ -223,26 +227,39 @@ def get_contribution_link(bibid, reroid, id, key, value):
match = _IDREF_REF_REGEX.search(id)
if match:
pid = match.group(1)
if key[:3] in ['100', '600', '610', '611', '700', '710', '711']:
# contribution
if key[:3] in _CONTRIBUTION_TAGS:
url = f'{mef_url}idref/{pid}'
try:
request = requests.get(url=url)
except requests.exceptions.RequestException as err:
error_print('ERROR MEF ACCESS:', bibid, reroid, url, err)
return None
if request.status_code == requests.codes.ok:
response = requests_retry_session().get(url)
if response.status_code == requests.codes.ok:
return url.replace(test_host, prod_host)
else:
subfiels = []
for v, k in value.items():
if v != '__order__':
subfiels.append(f'${v} {k}')
subfiels = ' '.join(subfiels)
field = f'{key} {subfiels}'
error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
bibid, reroid, field, url,
request.status_code)
subfields = []
for v, k in value.items():
if v != '__order__':
subfields.append(f'${v} {k}')
subfields = ' '.join(subfields)
field = f'{key} {subfields}'
error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
bibid, reroid, field, url, response.status_code)
else:
# Try to get the MEF IdRef url from RERO id.
match = _RERO_REF_REGEX.search(id)
if match:
pid = match.group(1)
if key[:3] in _CONTRIBUTION_TAGS:
url = f'{mef_url}rero/{pid}'
response = requests_retry_session().get(url)
if response.status_code == requests.codes.ok:
data = response.json()
mef_url = data.get('links', {}).get('mef')
response = requests_retry_session().get(mef_url)
if response.status_code == requests.codes.ok:
data = response.json()
idref_url = data.get(
'metadata', {}).get('idref', {}).get('$ref')
if idref_url:
return idref_url.replace(test_host, prod_host)

error_print('WARNING MEF CONTRIBUTION GET:', bibid, reroid, key, id)


@marc21.over('issuance', 'leader')
Expand Down Expand Up @@ -504,17 +521,16 @@ def marc21_to_contribution(self, key, value):
"""Get contribution."""
if not key[4] == '2' and key[:3] in ['100', '700', '710', '711']:
agent = {}
if value.get('0'):
refs = utils.force_list(value.get('0'))
for ref in refs:
ref = get_contribution_link(
marc21.bib_id, marc21.rero_id, ref, key, value)
if ref:
agent['$ref'] = ref
if key[:3] in ['100', '700']:
agent['type'] = 'bf:Person'
elif key[:3] in ['710', '711']:
agent['type'] = 'bf:Organisation'
refs = utils.force_list(value.get('0'))
if refs:
ref = get_contribution_link(
marc21.bib_id, marc21.rero_id, refs[0], key, value)
if ref:
agent['$ref'] = ref
if key[:3] in ['100', '700']:
agent['type'] = 'bf:Person'
elif key[:3] in ['710', '711']:
agent['type'] = 'bf:Organisation'

# we do not have a $ref
if not agent.get('$ref') and value.get('a'):
Expand Down Expand Up @@ -563,6 +579,9 @@ def marc21_to_contribution(self, key, value):
).lstrip('(').rstrip(')')
if fuller_form_of_name:
agent['fuller_form_of_name'] = fuller_form_of_name
identifier = build_identifier(value)
if identifier:
agent['identifiedBy'] = identifier

# 710|711 Organisation
elif key[:3] in ['710', '711']:
Expand Down Expand Up @@ -605,6 +624,9 @@ def marc21_to_contribution(self, key, value):
).lstrip('(').rstrip(')')
if place:
agent['place'] = place
identifier = build_identifier(value)
if identifier:
agent['identifiedBy'] = identifier

if value.get('4'):
roles = []
Expand Down Expand Up @@ -769,6 +791,7 @@ def build_place():
place = build_place()
if place:
publication['place'] = [place]

publication['statement'] = build_statement(value, ind2)
if subfields_c:
subfield_c = subfields_c[0]
Expand All @@ -791,6 +814,11 @@ def build_place():
pass

publication['statement'].append(date)

identifier = build_identifier(value)
if identifier:
publication['identifiedBy'] = identifier

return publication or None


Expand Down Expand Up @@ -1458,11 +1486,6 @@ def marc21_to_subjects(self, key, value):
'610': False,
'611': True
}
source_per_prefix = {
'(RERO)': 'rero',
'(IDREF)': 'idref'
}

source_per_indicator_2 = {
'0': 'LCSH',
'2': 'MeSH'
Expand All @@ -1475,46 +1498,26 @@ def marc21_to_subjects(self, key, value):
if subfields_2:
subfield_2 = subfields_2[0]
subfields_a = utils.force_list(value.get('a', []))
source_prefix = ''

if subfield_2 == 'rero':
# TODO: create a link to MEF when possible
has_dollar_t = value.get('t')

subfields_0 = utils.force_list(value.get('0'))
subfield_0 = None
identified_by = None
if subfields_0:
# remove the source prefix in parenthesis like '(RERO)'
source_prefix = re.sub(r'^(\(.*\)).*$', r'\1', subfields_0[0])
subfield_0 = re.sub(r'^\(.*\)(.*)$', r'\1', subfields_0[0])
source = source_per_prefix[source_prefix]
identified_by = {
'value': subfield_0,
'source': source,
'type': 'bf:Local'
}

if tag_key in ('600', '610', '611') and has_dollar_t:
tag_key += 't'

data_type = type_per_tag[tag_key]

start_with_digit = False
if tag_key == '650':
for subfield_a in subfields_a:
start_with_digit_regexp = re.compile(r'^\d')
match = start_with_digit_regexp.search(subfield_a)
if match:
start_with_digit = True
data_type = 'bf:Temporal'
break

subject = {
'source': 'rero',
'type': data_type,
}
if identified_by:
subject['identifiedBy'] = identified_by

string_build = build_string_from_subfields(
value,
Expand All @@ -1539,7 +1542,21 @@ def marc21_to_subjects(self, key, value):
if tag_key == '655':
field_key = 'genreForm'

if subject[field_data_per_tag[tag_key]]:
subfields_0 = utils.force_list(value.get('0'))
if data_type in ['bf:Person', 'bf:Organisation'] and subfields_0:
ref = get_contribution_link(
marc21.bib_id, marc21.rero_id, subfields_0[0], key, value)
if ref:
subject = {
'$ref': ref,
'type': data_type,
}
if not subject.get('$ref'):
identifier = build_identifier(value)
if identifier:
subject['identifiedBy'] = identifier

if subject.get('$ref') or subject.get(field_data_per_tag[tag_key]):
subjects = self.get(field_key, [])
subjects.append(subject)
self[field_key] = subjects
Expand Down Expand Up @@ -1694,7 +1711,7 @@ def get_classif_type_and_subdivision_codes_from_980_2(subfield_2):
subject = {
'type': 'bf:Person',
'preferred_name': subfield_a,
'source': 'factum'
'source': 'Factum'
}
subjects = self.get('subjects', [])
subjects.append(subject)
Expand Down
Loading

0 comments on commit 90095aa

Please sign in to comment.