documents: store identifiers

* Closes rero#1885. * Improves JSON schema reference definitions. * Closes rero#1873, last bullet point of rero#1873 (comment). * Adds identifiedBy values to subjects, genreForm and contribution if possible. Co-Authored-by: Peter Weber <peter.weber@rero.ch>
rerowep · May 17, 2021 · aaad122 · aaad122
1 parent 297b335
commit aaad122
Show file tree

Hide file tree

Showing 19 changed files with 2,824 additions and 2,638 deletions.
diff --git a/data/documents_big.json b/data/documents_big.json
diff --git a/data/documents_big.xml b/data/documents_big.xml
diff --git a/data/documents_small.json b/data/documents_small.json
diff --git a/data/documents_small.xml b/data/documents_small.xml
diff --git a/rero_ils/dojson/utils.py b/rero_ils/dojson/utils.py
@@ -287,6 +287,9 @@
 }
 
 
+re_identified = re.compile(r'\((.*)\)(.*)')  # '(type)value' -> groups 1, 2
+
+
 def error_print(*args):
     """Error printing to sdtout."""
     msg = ''
@@ -1778,3 +1781,23 @@ def build_responsibility_data(responsibility_data):
             index += 1
             responsibilities.append(out_data)
     return responsibilities
+
+
+def build_identifier(data):
+    """Build identifiedBy for document_identifier-v0.0.1.json from $0.
+
+    :param data: field data; its '0' entry is parsed as '(type)value'.
+    :returns: dict with 'type' and 'value', or None if $0 is absent/invalid.
+    """
+    result = None
+    data_0 = data.get('0')
+    if data_0:
+        match = re_identified.match(data_0)
+        if match:  # match is None when $0 has no leading '(type)' prefix
+            result = {
+                'type': match.group(1),
+                'value': match.group(2)
+            }
+        else:
+            click.echo(f'WARNING creating identifier: {data_0}')
+    return result
diff --git a/rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py b/rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
@@ -25,10 +25,10 @@
 from dojson.utils import GroupableOrderedDict
 
 from rero_ils.dojson.utils import ReroIlsMarc21Overdo, TitlePartList, \
-    add_note, build_responsibility_data, build_string_from_subfields, \
-    error_print, extract_subtitle_and_parallel_titles_from_field_245_b, \
-    get_field_items, get_field_link_data, make_year, not_repetitive, \
-    remove_trailing_punctuation
+    add_note, build_identifier, build_responsibility_data, \
+    build_string_from_subfields, error_print, \
+    extract_subtitle_and_parallel_titles_from_field_245_b, get_field_items, \
+    get_field_link_data, make_year, not_repetitive, remove_trailing_punctuation
 
 _ISSUANCE_MAIN_TYPE_PER_BIB_LEVEL = {
     'a': 'rdami:1001',
@@ -207,7 +207,7 @@
     'filmage_ch': re.compile(r'^from the age of')
 }
 
-_IDREF_REF_REGEX = re.compile(r'^\(IDREF\)(.*)?')
+_IDREF_REF_REGEX = re.compile(r'^\(IdRef\)(.*)?')
 
 
 marc21 = ReroIlsMarc21Overdo()
@@ -223,7 +223,8 @@ def get_contribution_link(bibid, reroid, id, key, value):
     match = _IDREF_REF_REGEX.search(id)
     if match:
         pid = match.group(1)
-        if key[:3] in ['100', '600', '610', '611', '700', '710', '711']:
+        if key[:3] in ['100', '600', '610', '611', '630', '650', '651', '655',
+                       '700', '710', '711']:
             # contribution
             url = f'{mef_url}idref/{pid}'
             try:
@@ -234,12 +235,12 @@ def get_contribution_link(bibid, reroid, id, key, value):
             if request.status_code == requests.codes.ok:
                 return url.replace(test_host, prod_host)
             else:
-                subfiels = []
+                subfields = []
                 for v, k in value.items():
                     if v != '__order__':
-                        subfiels.append(f'${v} {k}')
-                subfiels = ' '.join(subfiels)
-                field = f'{key} {subfiels}'
+                        subfields.append(f'${v} {k}')
+                subfields = ' '.join(subfields)
+                field = f'{key} {subfields}'
                 error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
                             bibid, reroid, field, url,
                             request.status_code)
@@ -506,9 +507,9 @@ def marc21_to_contribution(self, key, value):
         agent = {}
         if value.get('0'):
             refs = utils.force_list(value.get('0'))
-            for ref in refs:
+            if refs:
                 ref = get_contribution_link(
-                    marc21.bib_id, marc21.rero_id, ref, key, value)
+                    marc21.bib_id, marc21.rero_id, refs[0], key, value)
                 if ref:
                     agent['$ref'] = ref
                     if key[:3] in ['100', '700']:
@@ -563,6 +564,9 @@ def marc21_to_contribution(self, key, value):
                     ).lstrip('(').rstrip(')')
                     if fuller_form_of_name:
                         agent['fuller_form_of_name'] = fuller_form_of_name
+                identifier = build_identifier(value)
+                if identifier:
+                    agent['identifiedBy'] = identifier
 
             # 710|711 Organisation
             elif key[:3] in ['710', '711']:
@@ -605,6 +609,10 @@ def marc21_to_contribution(self, key, value):
                     ).lstrip('(').rstrip(')')
                     if place:
                         agent['place'] = place
+                identifier = build_identifier(value)
+                if identifier:
+                    agent['identifiedBy'] = identifier
+
 
         if value.get('4'):
             roles = []
@@ -769,6 +777,7 @@ def build_place():
         place = build_place()
         if place:
             publication['place'] = [place]
+
     publication['statement'] = build_statement(value, ind2)
     if subfields_c:
         subfield_c = subfields_c[0]
@@ -791,6 +800,11 @@ def build_place():
             pass
 
         publication['statement'].append(date)
+
+    identifier = build_identifier(value)
+    if identifier:
+        publication['identifiedBy'] = identifier
+
     return publication or None
 
 
@@ -1460,7 +1474,9 @@ def marc21_to_subjects(self, key, value):
     }
     source_per_prefix = {
         '(RERO)': 'rero',
-        '(IDREF)': 'idref'
+        '(IdRef)': 'idref',
+        '(GND)': 'gnd',
+        '(RERO-RAMEAU)': 'rero-rameau',
     }
 
     source_per_indicator_2 = {
@@ -1475,10 +1491,8 @@ def marc21_to_subjects(self, key, value):
     if subfields_2:
         subfield_2 = subfields_2[0]
     subfields_a = utils.force_list(value.get('a', []))
-    source_prefix = ''
 
     if subfield_2 == 'rero':
-        # TODO: create a link to MEF when possible
         has_dollar_t = value.get('t')
 
         subfields_0 = utils.force_list(value.get('0'))
@@ -1488,7 +1502,7 @@ def marc21_to_subjects(self, key, value):
             #  remove the source prefix in parenthesis like '(RERO)'
             source_prefix = re.sub(r'^(\(.*\)).*$', r'\1', subfields_0[0])
             subfield_0 = re.sub(r'^\(.*\)(.*)$', r'\1', subfields_0[0])
-            source = source_per_prefix[source_prefix]
+            source = source_per_prefix.get(source_prefix, source_prefix)
             identified_by = {
                 'value': subfield_0,
                 'source': source,
@@ -1497,8 +1511,8 @@ def marc21_to_subjects(self, key, value):
 
         if tag_key in ('600', '610', '611') and has_dollar_t:
             tag_key += 't'
-
         data_type = type_per_tag[tag_key]
+
         start_with_digit = False
         if tag_key == '650':
             for subfield_a in subfields_a:
@@ -1539,7 +1553,16 @@ def marc21_to_subjects(self, key, value):
         if tag_key == '655':
             field_key = 'genreForm'
 
-        if subject[field_data_per_tag[tag_key]]:
+        if data_type in ['bf:Person', 'bf:Organisation'] and subfields_0:
+            ref = get_contribution_link(
+                marc21.bib_id, marc21.rero_id, subfields_0[0], key, value)
+            if ref:
+                subject = {
+                    '$ref': ref,
+                    'type': data_type,
+                }
+
+        if subject.get('$ref') or subject.get(field_data_per_tag[tag_key]):
             subjects = self.get(field_key, [])
             subjects.append(subject)
             self[field_key] = subjects