documents: store identifiers

* Closes rero#1885. * Improves json schema reference definitions. * Closes rero#1873, last bullet point of rero#1873 (comment). * Fixes startDate and endDate in the editor to accept integers between -9999 and 2050. Co-Authored-by: Peter Weber <peter.weber@rero.ch>
rerowep · May 17, 2021 · b52e7d8 · b52e7d8
1 parent 297b335
commit b52e7d8
Show file tree

Hide file tree

Showing 17 changed files with 2,298 additions and 2,131 deletions.
diff --git a/data/documents_big.xml b/data/documents_big.xml
diff --git a/data/documents_small.xml b/data/documents_small.xml
diff --git a/rero_ils/dojson/utils.py b/rero_ils/dojson/utils.py
@@ -287,6 +287,9 @@
 }
 
 
+re_identified = re.compile(r'\((.*)\)(.*)')  # '(type)value' -> type, value
+
+
 def error_print(*args):
     """Error printing to sdtout."""
     msg = ''
@@ -1778,3 +1781,23 @@ def build_responsibility_data(responsibility_data):
             index += 1
             responsibilities.append(out_data)
     return responsibilities
+
+
+def build_identifier(data):
+    """Build identifiedBy for document_identifier-v0.0.1.json from $0.
+
+    :param data: data to build the identifiedBy from.
+    :returns: identifiedBy from $0, or None if $0 is absent or malformed.
+    """
+    result = None
+    data_0 = data.get('0')
+    if data_0:
+        # match() returns None instead of raising when $0 does not start
+        # with a '(type)' prefix, so test the match rather than catching
+        # IndexError (calling .group() on None raises AttributeError).
+        match = re_identified.match(data_0)
+        if match:
+            result = {'type': match.group(1), 'value': match.group(2)}
+        else:
+            click.echo(f'WARNING creating identifier: {data_0}')
+    return result
diff --git a/rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py b/rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
@@ -25,10 +25,10 @@
 from dojson.utils import GroupableOrderedDict
 
 from rero_ils.dojson.utils import ReroIlsMarc21Overdo, TitlePartList, \
-    add_note, build_responsibility_data, build_string_from_subfields, \
-    error_print, extract_subtitle_and_parallel_titles_from_field_245_b, \
-    get_field_items, get_field_link_data, make_year, not_repetitive, \
-    remove_trailing_punctuation
+    add_note, build_identifier, build_responsibility_data, \
+    build_string_from_subfields, error_print, \
+    extract_subtitle_and_parallel_titles_from_field_245_b, get_field_items, \
+    get_field_link_data, make_year, not_repetitive, remove_trailing_punctuation
 
 _ISSUANCE_MAIN_TYPE_PER_BIB_LEVEL = {
     'a': 'rdami:1001',
@@ -207,7 +207,7 @@
     'filmage_ch': re.compile(r'^from the age of')
 }
 
-_IDREF_REF_REGEX = re.compile(r'^\(IDREF\)(.*)?')
+_IDREF_REF_REGEX = re.compile(r'^\(IdRef\)(.*)?')
 
 
 marc21 = ReroIlsMarc21Overdo()
@@ -223,7 +223,8 @@ def get_contribution_link(bibid, reroid, id, key, value):
     match = _IDREF_REF_REGEX.search(id)
     if match:
         pid = match.group(1)
-        if key[:3] in ['100', '600', '610', '611', '700', '710', '711']:
+        if key[:3] in ['100', '600', '610', '611', '630', '650', '651', '655',
+                       '700', '710', '711']:
             # contribution
             url = f'{mef_url}idref/{pid}'
             try:
@@ -234,12 +235,12 @@ def get_contribution_link(bibid, reroid, id, key, value):
             if request.status_code == requests.codes.ok:
                 return url.replace(test_host, prod_host)
             else:
-                subfiels = []
+                subfields = []
                 for v, k in value.items():
                     if v != '__order__':
-                        subfiels.append(f'${v} {k}')
-                subfiels = ' '.join(subfiels)
-                field = f'{key} {subfiels}'
+                        subfields.append(f'${v} {k}')
+                subfields = ' '.join(subfields)
+                field = f'{key} {subfields}'
                 error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
                             bibid, reroid, field, url,
                             request.status_code)
@@ -506,9 +507,9 @@ def marc21_to_contribution(self, key, value):
         agent = {}
         if value.get('0'):
             refs = utils.force_list(value.get('0'))
-            for ref in refs:
+            if refs:
                 ref = get_contribution_link(
-                    marc21.bib_id, marc21.rero_id, ref, key, value)
+                    marc21.bib_id, marc21.rero_id, refs[0], key, value)
                 if ref:
                     agent['$ref'] = ref
                     if key[:3] in ['100', '700']:
@@ -563,6 +564,9 @@ def marc21_to_contribution(self, key, value):
                     ).lstrip('(').rstrip(')')
                     if fuller_form_of_name:
                         agent['fuller_form_of_name'] = fuller_form_of_name
+                identifier = build_identifier(value)
+                if identifier:
+                    agent['identifiedBy'] = identifier
 
             # 710|711 Organisation
             elif key[:3] in ['710', '711']:
@@ -605,6 +609,10 @@ def marc21_to_contribution(self, key, value):
                     ).lstrip('(').rstrip(')')
                     if place:
                         agent['place'] = place
+                identifier = build_identifier(value)
+                if identifier:
+                    agent['identifiedBy'] = identifier
+
 
         if value.get('4'):
             roles = []
@@ -791,6 +799,11 @@ def build_place():
             pass
 
         publication['statement'].append(date)
+
+    identifier = build_identifier(value)
+    if identifier:
+        publication['identifiedBy'] = identifier
+
     return publication or None
 
 
@@ -1460,7 +1473,9 @@ def marc21_to_subjects(self, key, value):
     }
     source_per_prefix = {
         '(RERO)': 'rero',
-        '(IDREF)': 'idref'
+        '(IdRef)': 'idref',
+        '(GND)': 'gnd',
+        '(RERO-RAMEAU)': 'rero-rameau',
     }
 
     source_per_indicator_2 = {
@@ -1475,66 +1490,72 @@ def marc21_to_subjects(self, key, value):
     if subfields_2:
         subfield_2 = subfields_2[0]
     subfields_a = utils.force_list(value.get('a', []))
-    source_prefix = ''
 
     if subfield_2 == 'rero':
-        # TODO: create a link to MEF when possible
         has_dollar_t = value.get('t')
 
         subfields_0 = utils.force_list(value.get('0'))
         subfield_0 = None
         identified_by = None
         if subfields_0:
+            ref = get_contribution_link(
+                marc21.bib_id, marc21.rero_id, subfields_0[0], key, value)
+
             #  remove the source prefix in parenthesis like '(RERO)'
             source_prefix = re.sub(r'^(\(.*\)).*$', r'\1', subfields_0[0])
             subfield_0 = re.sub(r'^\(.*\)(.*)$', r'\1', subfields_0[0])
-            source = source_per_prefix[source_prefix]
+            source = source_per_prefix.get(source_prefix, source_prefix)
             identified_by = {
                 'value': subfield_0,
                 'source': source,
                 'type': 'bf:Local'
             }
-
-        if tag_key in ('600', '610', '611') and has_dollar_t:
-            tag_key += 't'
-
         data_type = type_per_tag[tag_key]
-        start_with_digit = False
-        if tag_key == '650':
-            for subfield_a in subfields_a:
-                start_with_digit_regexp = re.compile(r'^\d')
-                match = start_with_digit_regexp.search(subfield_a)
-                if match:
-                    start_with_digit = True
-                    data_type = 'bf:Temporal'
-                    break
+        if ref:
+            subject = {
+                '$ref': ref,
+                'type': data_type,
+            }
+        else:
+            if tag_key in ('600', '610', '611') and has_dollar_t:
+                tag_key += 't'
+
+            start_with_digit = False
+            if tag_key == '650':
+                for subfield_a in subfields_a:
+                    start_with_digit_regexp = re.compile(r'^\d')
+                    match = start_with_digit_regexp.search(subfield_a)
+                    if match:
+                        start_with_digit = True
+                        data_type = 'bf:Temporal'
+                        break
 
-        subject = {
-            'source': 'rero',
-            'type': data_type,
-        }
-        if identified_by:
-            subject['identifiedBy'] = identified_by
+            subject = {
+                'source': 'rero',
+                'type': data_type,
+            }
+            if identified_by:
+                subject['identifiedBy'] = identified_by
 
-        string_build = build_string_from_subfields(
-            value,
-            subfield_code_per_tag[tag_key])
-        if (tag_key == '655'):
-            # remove the square brackets
-            string_build = re.sub(r'^\[(.*)\]$', r'\1', string_build)
-        subject[field_data_per_tag[tag_key]] = string_build
-
-        if tag_key in ('610', '611'):
-            subject['conference'] = conference_per_tag[tag_key]
-
-        if tag_key in ('600t', '610t', '611t'):
-            creator_tag_key = tag_key[:3]  # to keep only tag:  600, 610, 611
-            subject['creator'] = remove_trailing_punctuation(
-                build_string_from_subfields(
-                    value,
-                    subfield_code_per_tag[creator_tag_key]),
-                '.', '.'
-            )
+            string_build = build_string_from_subfields(
+                value,
+                subfield_code_per_tag[tag_key])
+            if (tag_key == '655'):
+                # remove the square brackets
+                string_build = re.sub(r'^\[(.*)\]$', r'\1', string_build)
+            subject[field_data_per_tag[tag_key]] = string_build
+
+            if tag_key in ('610', '611'):
+                subject['conference'] = conference_per_tag[tag_key]
+
+            if tag_key in ('600t', '610t', '611t'):
+                creator_tag_key = tag_key[:3]  # to keep only tag:  600, 610, 611
+                subject['creator'] = remove_trailing_punctuation(
+                    build_string_from_subfields(
+                        value,
+                        subfield_code_per_tag[creator_tag_key]),
+                    '.', '.'
+                )
         field_key = 'subjects'
         if tag_key == '655':
             field_key = 'genreForm'