Skip to content

Commit

Permalink
documents: store identifiers
Browse files Browse the repository at this point in the history
* Closes rero#1885.
* Improves json schema reference definitions.
* Closes rero#1873, last bullet point of rero#1873 (comment).
* Adds identifiedBy values to subjects, genreForm and contribution if possible.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
  • Loading branch information
rerowep and rerowep committed May 17, 2021
1 parent 297b335 commit aaad122
Show file tree
Hide file tree
Showing 19 changed files with 2,824 additions and 2,638 deletions.
825 changes: 400 additions & 425 deletions data/documents_big.json

Large diffs are not rendered by default.

2,596 changes: 1,298 additions & 1,298 deletions data/documents_big.xml

Large diffs are not rendered by default.

246 changes: 130 additions & 116 deletions data/documents_small.json

Large diffs are not rendered by default.

718 changes: 359 additions & 359 deletions data/documents_small.xml

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@
}


# Matches a leading parenthesised prefix followed by the rest of the string,
# e.g. '(PREFIX)rest': group 1 is the text inside the parentheses, group 2 is
# everything after the closing parenthesis. Both groups are greedy, so group 1
# extends up to the LAST ')' found in the string.
re_identified = re.compile(r'\((.*)\)(.*)')


def error_print(*args):
"""Error printing to sdtout."""
msg = ''
Expand Down Expand Up @@ -1778,3 +1781,23 @@ def build_responsibility_data(responsibility_data):
index += 1
responsibilities.append(out_data)
return responsibilities


def build_identifier(data):
    """Build identifiedBy for document_identifier-v0.0.1.json from $0.

    The $0 value is matched against ``re_identified``; the text inside the
    leading parentheses becomes ``type`` and the remainder becomes ``value``.
    When $0 is present but does not match, a warning is printed and
    ``None`` is returned.

    :param data: data to build the identifiedBy from.
    :returns: identifiedBy from $0 or None.
    """
    data_0 = data.get('0')
    if not data_0:
        # No $0 subfield (or an empty one): nothing to build.
        return None
    match = re_identified.match(data_0)
    if match is None:
        # Pattern.match() returns None on failure, it does not raise; test it
        # explicitly so malformed $0 values produce a warning, not a crash.
        click.echo(f'WARNING creating identifier: {data_0}')
        return None
    return {
        'type': match.group(1),
        'value': match.group(2)
    }
59 changes: 41 additions & 18 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
from dojson.utils import GroupableOrderedDict

from rero_ils.dojson.utils import ReroIlsMarc21Overdo, TitlePartList, \
add_note, build_responsibility_data, build_string_from_subfields, \
error_print, extract_subtitle_and_parallel_titles_from_field_245_b, \
get_field_items, get_field_link_data, make_year, not_repetitive, \
remove_trailing_punctuation
add_note, build_identifier, build_responsibility_data, \
build_string_from_subfields, error_print, \
extract_subtitle_and_parallel_titles_from_field_245_b, get_field_items, \
get_field_link_data, make_year, not_repetitive, remove_trailing_punctuation

_ISSUANCE_MAIN_TYPE_PER_BIB_LEVEL = {
'a': 'rdami:1001',
Expand Down Expand Up @@ -207,7 +207,7 @@
'filmage_ch': re.compile(r'^from the age of')
}

_IDREF_REF_REGEX = re.compile(r'^\(IDREF\)(.*)?')
_IDREF_REF_REGEX = re.compile(r'^\(IdRef\)(.*)?')


marc21 = ReroIlsMarc21Overdo()
Expand All @@ -223,7 +223,8 @@ def get_contribution_link(bibid, reroid, id, key, value):
match = _IDREF_REF_REGEX.search(id)
if match:
pid = match.group(1)
if key[:3] in ['100', '600', '610', '611', '700', '710', '711']:
if key[:3] in ['100', '600', '610', '611', '630', '650', '651', '655',
'700', '710', '711']:
# contribution
url = f'{mef_url}idref/{pid}'
try:
Expand All @@ -234,12 +235,12 @@ def get_contribution_link(bibid, reroid, id, key, value):
if request.status_code == requests.codes.ok:
return url.replace(test_host, prod_host)
else:
subfiels = []
subfields = []
for v, k in value.items():
if v != '__order__':
subfiels.append(f'${v} {k}')
subfiels = ' '.join(subfiels)
field = f'{key} {subfiels}'
subfields.append(f'${v} {k}')
subfields = ' '.join(subfields)
field = f'{key} {subfields}'
error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
bibid, reroid, field, url,
request.status_code)
Expand Down Expand Up @@ -506,9 +507,9 @@ def marc21_to_contribution(self, key, value):
agent = {}
if value.get('0'):
refs = utils.force_list(value.get('0'))
for ref in refs:
if refs:
ref = get_contribution_link(
marc21.bib_id, marc21.rero_id, ref, key, value)
marc21.bib_id, marc21.rero_id, refs[0], key, value)
if ref:
agent['$ref'] = ref
if key[:3] in ['100', '700']:
Expand Down Expand Up @@ -563,6 +564,9 @@ def marc21_to_contribution(self, key, value):
).lstrip('(').rstrip(')')
if fuller_form_of_name:
agent['fuller_form_of_name'] = fuller_form_of_name
identifier = build_identifier(value)
if identifier:
agent['identifiedBy'] = identifier

# 710|711 Organisation
elif key[:3] in ['710', '711']:
Expand Down Expand Up @@ -605,6 +609,10 @@ def marc21_to_contribution(self, key, value):
).lstrip('(').rstrip(')')
if place:
agent['place'] = place
identifier = build_identifier(value)
if identifier:
agent['identifiedBy'] = identifier


if value.get('4'):
roles = []
Expand Down Expand Up @@ -769,6 +777,7 @@ def build_place():
place = build_place()
if place:
publication['place'] = [place]

publication['statement'] = build_statement(value, ind2)
if subfields_c:
subfield_c = subfields_c[0]
Expand All @@ -791,6 +800,11 @@ def build_place():
pass

publication['statement'].append(date)

identifier = build_identifier(value)
if identifier:
publication['identifiedBy'] = identifier

return publication or None


Expand Down Expand Up @@ -1460,7 +1474,9 @@ def marc21_to_subjects(self, key, value):
}
source_per_prefix = {
'(RERO)': 'rero',
'(IDREF)': 'idref'
'(IdRef)': 'idref',
'(GND)': 'gnd',
'(RERO-RAMEAU)': 'rero-rameau',
}

source_per_indicator_2 = {
Expand All @@ -1475,10 +1491,8 @@ def marc21_to_subjects(self, key, value):
if subfields_2:
subfield_2 = subfields_2[0]
subfields_a = utils.force_list(value.get('a', []))
source_prefix = ''

if subfield_2 == 'rero':
# TODO: create a link to MEF when possible
has_dollar_t = value.get('t')

subfields_0 = utils.force_list(value.get('0'))
Expand All @@ -1488,7 +1502,7 @@ def marc21_to_subjects(self, key, value):
# remove the source prefix in parenthesis like '(RERO)'
source_prefix = re.sub(r'^(\(.*\)).*$', r'\1', subfields_0[0])
subfield_0 = re.sub(r'^\(.*\)(.*)$', r'\1', subfields_0[0])
source = source_per_prefix[source_prefix]
source = source_per_prefix.get(source_prefix, source_prefix)
identified_by = {
'value': subfield_0,
'source': source,
Expand All @@ -1497,8 +1511,8 @@ def marc21_to_subjects(self, key, value):

if tag_key in ('600', '610', '611') and has_dollar_t:
tag_key += 't'

data_type = type_per_tag[tag_key]

start_with_digit = False
if tag_key == '650':
for subfield_a in subfields_a:
Expand Down Expand Up @@ -1539,7 +1553,16 @@ def marc21_to_subjects(self, key, value):
if tag_key == '655':
field_key = 'genreForm'

if subject[field_data_per_tag[tag_key]]:
if data_type in ['bf:Person', 'bf:Organisation'] and subfields_0:
ref = get_contribution_link(
marc21.bib_id, marc21.rero_id, subfields_0[0], key, value)
if ref:
subject = {
'$ref': ref,
'type': data_type,
}

if subject.get('$ref') or subject.get(field_data_per_tag[tag_key]):
subjects = self.get(field_key, [])
subjects.append(subject)
self[field_key] = subjects
Expand Down
Loading

0 comments on commit aaad122

Please sign in to comment.