Skip to content

Commit

Permalink
documents: store identifiers
Browse files Browse the repository at this point in the history
* Closes rero#1885.
* Improves json schema reference definitions.
* Closes rero#1873, last bullet point of rero#1873 (comment).
* Fixes startDate and endDate in the editor to accept integers between
  -9999 and 2050.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
  • Loading branch information
rerowep and rerowep committed May 17, 2021
1 parent 297b335 commit b52e7d8
Show file tree
Hide file tree
Showing 17 changed files with 2,298 additions and 2,131 deletions.
2,596 changes: 1,298 additions & 1,298 deletions data/documents_big.xml

Large diffs are not rendered by default.

718 changes: 359 additions & 359 deletions data/documents_small.xml

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,9 @@
}


# Splits a string shaped like '(TYPE)value' into two groups: group 1 is the
# text inside the parentheses, group 2 is everything after the closing one.
# NOTE(review): both groups are greedy, so when the string holds several ')'
# group 1 extends up to the last ')' on the line.
re_identified = re.compile(r'\((.*)\)(.*)')


def error_print(*args):
"""Error printing to sdtout."""
msg = ''
Expand Down Expand Up @@ -1778,3 +1781,23 @@ def build_responsibility_data(responsibility_data):
index += 1
responsibilities.append(out_data)
return responsibilities


def build_identifier(data):
    """Build identifiedBy for document_identifier-v0.0.1.json from $0.

    The ``$0`` subfield is expected to look like ``(TYPE)value``: the text
    inside the leading parentheses becomes the identifier ``type`` and the
    text after the closing parenthesis becomes its ``value``.

    A ``$0`` that cannot be parsed (no leading ``(TYPE)`` prefix, or not a
    single string) is reported with a warning and yields ``None`` instead of
    aborting the conversion.

    :param data: data to build the identifiedBy from.
    :returns: identifiedBy from $0 or None.
    """
    data_0 = data.get('0')
    if not data_0:
        return None
    try:
        match = re_identified.match(data_0)
    except TypeError:
        # $0 is not a plain string (e.g. a repeated subfield).
        match = None
    if match is None:
        # ``match()`` returns None on failure, so the previous
        # ``except IndexError`` never fired and ``match.group`` raised
        # AttributeError instead of emitting this warning.
        click.echo(f'WARNING creating identifier: {data_0}')
        return None
    return {
        'type': match.group(1),
        'value': match.group(2)
    }
129 changes: 75 additions & 54 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
from dojson.utils import GroupableOrderedDict

from rero_ils.dojson.utils import ReroIlsMarc21Overdo, TitlePartList, \
add_note, build_responsibility_data, build_string_from_subfields, \
error_print, extract_subtitle_and_parallel_titles_from_field_245_b, \
get_field_items, get_field_link_data, make_year, not_repetitive, \
remove_trailing_punctuation
add_note, build_identifier, build_responsibility_data, \
build_string_from_subfields, error_print, \
extract_subtitle_and_parallel_titles_from_field_245_b, get_field_items, \
get_field_link_data, make_year, not_repetitive, remove_trailing_punctuation

_ISSUANCE_MAIN_TYPE_PER_BIB_LEVEL = {
'a': 'rdami:1001',
Expand Down Expand Up @@ -207,7 +207,7 @@
'filmage_ch': re.compile(r'^from the age of')
}

_IDREF_REF_REGEX = re.compile(r'^\(IDREF\)(.*)?')
_IDREF_REF_REGEX = re.compile(r'^\(IdRef\)(.*)?')


marc21 = ReroIlsMarc21Overdo()
Expand All @@ -223,7 +223,8 @@ def get_contribution_link(bibid, reroid, id, key, value):
match = _IDREF_REF_REGEX.search(id)
if match:
pid = match.group(1)
if key[:3] in ['100', '600', '610', '611', '700', '710', '711']:
if key[:3] in ['100', '600', '610', '611', '630', '650', '651', '655',
'700', '710', '711']:
# contribution
url = f'{mef_url}idref/{pid}'
try:
Expand All @@ -234,12 +235,12 @@ def get_contribution_link(bibid, reroid, id, key, value):
if request.status_code == requests.codes.ok:
return url.replace(test_host, prod_host)
else:
subfiels = []
subfields = []
for v, k in value.items():
if v != '__order__':
subfiels.append(f'${v} {k}')
subfiels = ' '.join(subfiels)
field = f'{key} {subfiels}'
subfields.append(f'${v} {k}')
subfields = ' '.join(subfields)
field = f'{key} {subfields}'
error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
bibid, reroid, field, url,
request.status_code)
Expand Down Expand Up @@ -506,9 +507,9 @@ def marc21_to_contribution(self, key, value):
agent = {}
if value.get('0'):
refs = utils.force_list(value.get('0'))
for ref in refs:
if refs:
ref = get_contribution_link(
marc21.bib_id, marc21.rero_id, ref, key, value)
marc21.bib_id, marc21.rero_id, refs[0], key, value)
if ref:
agent['$ref'] = ref
if key[:3] in ['100', '700']:
Expand Down Expand Up @@ -563,6 +564,9 @@ def marc21_to_contribution(self, key, value):
).lstrip('(').rstrip(')')
if fuller_form_of_name:
agent['fuller_form_of_name'] = fuller_form_of_name
identifier = build_identifier(value)
if identifier:
agent['identifiedBy'] = identifier

# 710|711 Organisation
elif key[:3] in ['710', '711']:
Expand Down Expand Up @@ -605,6 +609,10 @@ def marc21_to_contribution(self, key, value):
).lstrip('(').rstrip(')')
if place:
agent['place'] = place
identifier = build_identifier(value)
if identifier:
agent['identifiedBy'] = identifier


if value.get('4'):
roles = []
Expand Down Expand Up @@ -791,6 +799,11 @@ def build_place():
pass

publication['statement'].append(date)

identifier = build_identifier(value)
if identifier:
publication['identifiedBy'] = identifier

return publication or None


Expand Down Expand Up @@ -1460,7 +1473,9 @@ def marc21_to_subjects(self, key, value):
}
source_per_prefix = {
'(RERO)': 'rero',
'(IDREF)': 'idref'
'(IdRef)': 'idref',
'(GND)': 'gnd',
'(RERO-RAMEAU)': 'rero-rameau',
}

source_per_indicator_2 = {
Expand All @@ -1475,66 +1490,72 @@ def marc21_to_subjects(self, key, value):
if subfields_2:
subfield_2 = subfields_2[0]
subfields_a = utils.force_list(value.get('a', []))
source_prefix = ''

if subfield_2 == 'rero':
# TODO: create a link to MEF when possible
has_dollar_t = value.get('t')

subfields_0 = utils.force_list(value.get('0'))
subfield_0 = None
identified_by = None
if subfields_0:
ref = get_contribution_link(
marc21.bib_id, marc21.rero_id, subfields_0[0], key, value)

# remove the source prefix in parenthesis like '(RERO)'
source_prefix = re.sub(r'^(\(.*\)).*$', r'\1', subfields_0[0])
subfield_0 = re.sub(r'^\(.*\)(.*)$', r'\1', subfields_0[0])
source = source_per_prefix[source_prefix]
source = source_per_prefix.get(source_prefix, source_prefix)
identified_by = {
'value': subfield_0,
'source': source,
'type': 'bf:Local'
}

if tag_key in ('600', '610', '611') and has_dollar_t:
tag_key += 't'

data_type = type_per_tag[tag_key]
start_with_digit = False
if tag_key == '650':
for subfield_a in subfields_a:
start_with_digit_regexp = re.compile(r'^\d')
match = start_with_digit_regexp.search(subfield_a)
if match:
start_with_digit = True
data_type = 'bf:Temporal'
break
if ref:
subject = {
'$ref': ref,
'type': data_type,
}
else:
if tag_key in ('600', '610', '611') and has_dollar_t:
tag_key += 't'

start_with_digit = False
if tag_key == '650':
for subfield_a in subfields_a:
start_with_digit_regexp = re.compile(r'^\d')
match = start_with_digit_regexp.search(subfield_a)
if match:
start_with_digit = True
data_type = 'bf:Temporal'
break

subject = {
'source': 'rero',
'type': data_type,
}
if identified_by:
subject['identifiedBy'] = identified_by
subject = {
'source': 'rero',
'type': data_type,
}
if identified_by:
subject['identifiedBy'] = identified_by

string_build = build_string_from_subfields(
value,
subfield_code_per_tag[tag_key])
if (tag_key == '655'):
# remove the square brackets
string_build = re.sub(r'^\[(.*)\]$', r'\1', string_build)
subject[field_data_per_tag[tag_key]] = string_build

if tag_key in ('610', '611'):
subject['conference'] = conference_per_tag[tag_key]

if tag_key in ('600t', '610t', '611t'):
creator_tag_key = tag_key[:3] # to keep only tag: 600, 610, 611
subject['creator'] = remove_trailing_punctuation(
build_string_from_subfields(
value,
subfield_code_per_tag[creator_tag_key]),
'.', '.'
)
string_build = build_string_from_subfields(
value,
subfield_code_per_tag[tag_key])
if (tag_key == '655'):
# remove the square brackets
string_build = re.sub(r'^\[(.*)\]$', r'\1', string_build)
subject[field_data_per_tag[tag_key]] = string_build

if tag_key in ('610', '611'):
subject['conference'] = conference_per_tag[tag_key]

if tag_key in ('600t', '610t', '611t'):
creator_tag_key = tag_key[:3] # to keep only tag: 600, 610, 611
subject['creator'] = remove_trailing_punctuation(
build_string_from_subfields(
value,
subfield_code_per_tag[creator_tag_key]),
'.', '.'
)
field_key = 'subjects'
if tag_key == '655':
field_key = 'genreForm'
Expand Down
Loading

0 comments on commit b52e7d8

Please sign in to comment.