documents: store source of authority identifiers

* Closes rero#1885. * Improves JSON schema reference definitions. * Closes rero#1873, last bullet point of rero#1873 (comment). * Adds identifiedBy values to subjects, genreForm and contribution if possible. Co-Authored-by: Peter Weber <peter.weber@rero.ch>
rerowep · May 19, 2021 · 90095aa · 90095aa
1 parent efca4ad
commit 90095aa
Show file tree

Hide file tree

Showing 20 changed files with 3,535 additions and 6,139 deletions.
diff --git a/data/documents_big.json b/data/documents_big.json
diff --git a/data/documents_big.xml b/data/documents_big.xml
diff --git a/data/documents_small.json b/data/documents_small.json
diff --git a/data/documents_small.xml b/data/documents_small.xml
diff --git a/rero_ils/dojson/utils.py b/rero_ils/dojson/utils.py
@@ -287,6 +287,9 @@
 }
 
 
+re_identified = re.compile(r'\((.*?)\)(.*)')
+
+
 def error_print(*args):
     """Error printing to sdtout."""
     msg = ''
@@ -1778,3 +1781,32 @@ def build_responsibility_data(responsibility_data):
             index += 1
             responsibilities.append(out_data)
     return responsibilities
+
+
+def build_identifier(data):
+    """Build identifiedBy for document_identifier-v0.0.1.json from $0.
+
+    :param data: field data; only its first $0, ``(SOURCE)value``, is used.
+    :returns: identifiedBy dict, or None if $0 is missing or malformed.
+    """
+    sources = {
+        'RERO': 'RERO',
+        'RERO-RAMEAU': 'RERO-RAMEAU',
+        'IDREF': 'IdRef',
+        'GND': 'GND'
+    }
+    data_0 = utils.force_list(data.get('0'))
+    if not data_0:
+        return None
+    # `match` is None when $0 lacks a "(SOURCE)" prefix: warn and give up
+    # instead of failing on `None.group()`.
+    match = re_identified.match(data_0[0])
+    if not match:
+        click.echo(f'WARNING creating identifier: {data_0}')
+        return None
+    source, value = match.groups()
+    identifier_type = sources.get(source.upper())
+    if identifier_type:
+        return {'value': value, 'type': identifier_type}
+    # Unknown source: keep it as a local identifier with its origin.
+    return {'value': value, 'type': 'bf:Local', 'source': source}
diff --git a/rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py b/rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
@@ -25,10 +25,12 @@
 from dojson.utils import GroupableOrderedDict
 
 from rero_ils.dojson.utils import ReroIlsMarc21Overdo, TitlePartList, \
-    add_note, build_responsibility_data, build_string_from_subfields, \
-    error_print, extract_subtitle_and_parallel_titles_from_field_245_b, \
-    get_field_items, get_field_link_data, make_year, not_repetitive, \
+    add_note, build_identifier, build_responsibility_data, \
+    build_string_from_subfields, error_print, \
+    extract_subtitle_and_parallel_titles_from_field_245_b, get_field_items, \
+    get_field_link_data, make_year, not_repetitive, \
     remove_trailing_punctuation
+from rero_ils.modules.utils import requests_retry_session
 
 _ISSUANCE_MAIN_TYPE_PER_BIB_LEVEL = {
     'a': 'rdami:1001',
@@ -207,8 +209,10 @@
     'filmage_ch': re.compile(r'^from the age of')
 }
 
-_IDREF_REF_REGEX = re.compile(r'^\(IDREF\)(.*)?')
-
+_IDREF_REF_REGEX = re.compile(r'^\(IdRef\)(.*)?', re.IGNORECASE)
+_RERO_REF_REGEX = re.compile(r'^\(RERO\)(.*)?', re.IGNORECASE)
+_CONTRIBUTION_TAGS = ['100', '600', '610', '611', '630', '650', '651',
+                      '655', '700', '710', '711']
 
 marc21 = ReroIlsMarc21Overdo()
 
@@ -223,26 +227,39 @@ def get_contribution_link(bibid, reroid, id, key, value):
     match = _IDREF_REF_REGEX.search(id)
     if match:
         pid = match.group(1)
-        if key[:3] in ['100', '600', '610', '611', '700', '710', '711']:
-            # contribution
+        if key[:3] in _CONTRIBUTION_TAGS:
             url = f'{mef_url}idref/{pid}'
-            try:
-                request = requests.get(url=url)
-            except requests.exceptions.RequestException as err:
-                error_print('ERROR MEF ACCESS:', bibid, reroid, url, err)
-                return None
-            if request.status_code == requests.codes.ok:
+            response = requests_retry_session().get(url)
+            if response.status_code == requests.codes.ok:
                 return url.replace(test_host, prod_host)
-            else:
-                subfiels = []
-                for v, k in value.items():
-                    if v != '__order__':
-                        subfiels.append(f'${v} {k}')
-                subfiels = ' '.join(subfiels)
-                field = f'{key} {subfiels}'
-                error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
-                            bibid, reroid, field, url,
-                            request.status_code)
+            subfields = []
+            for v, k in value.items():
+                if v != '__order__':
+                    subfields.append(f'${v} {k}')
+            subfields = ' '.join(subfields)
+            field = f'{key} {subfields}'
+            error_print('WARNING MEF CONTRIBUTION IDREF NOT FOUND:',
+                        bibid, reroid, field, url, response.status_code)
+    else:
+        # Try to get the MEF IdRef url from RERO id.
+        match = _RERO_REF_REGEX.search(id)
+        if match:
+            pid = match.group(1)
+            if key[:3] in _CONTRIBUTION_TAGS:
+                url = f'{mef_url}rero/{pid}'
+                response = requests_retry_session().get(url)
+                if response.status_code == requests.codes.ok:
+                    data = response.json()
+                    mef_url = data.get('links', {}).get('mef')
+                    response = requests_retry_session().get(mef_url)
+                    if response.status_code == requests.codes.ok:
+                        data = response.json()
+                        idref_url = data.get(
+                            'metadata', {}).get('idref', {}).get('$ref')
+                        if idref_url:
+                            return idref_url.replace(test_host, prod_host)
+
+    error_print('WARNING MEF CONTRIBUTION GET:', bibid, reroid, key, id)
 
 
 @marc21.over('issuance', 'leader')
@@ -504,17 +521,16 @@ def marc21_to_contribution(self, key, value):
     """Get contribution."""
     if not key[4] == '2' and key[:3] in ['100', '700', '710', '711']:
         agent = {}
-        if value.get('0'):
-            refs = utils.force_list(value.get('0'))
-            for ref in refs:
-                ref = get_contribution_link(
-                    marc21.bib_id, marc21.rero_id, ref, key, value)
-                if ref:
-                    agent['$ref'] = ref
-                    if key[:3] in ['100', '700']:
-                        agent['type'] = 'bf:Person'
-                    elif key[:3] in ['710', '711']:
-                        agent['type'] = 'bf:Organisation'
+        refs = utils.force_list(value.get('0'))
+        if refs:
+            ref = get_contribution_link(
+                marc21.bib_id, marc21.rero_id, refs[0], key, value)
+            if ref:
+                agent['$ref'] = ref
+                if key[:3] in ['100', '700']:
+                    agent['type'] = 'bf:Person'
+                elif key[:3] in ['710', '711']:
+                    agent['type'] = 'bf:Organisation'
 
         # we do not have a $ref
         if not agent.get('$ref') and value.get('a'):
@@ -563,6 +579,9 @@ def marc21_to_contribution(self, key, value):
                     ).lstrip('(').rstrip(')')
                     if fuller_form_of_name:
                         agent['fuller_form_of_name'] = fuller_form_of_name
+                identifier = build_identifier(value)
+                if identifier:
+                    agent['identifiedBy'] = identifier
 
             # 710|711 Organisation
             elif key[:3] in ['710', '711']:
@@ -605,6 +624,9 @@ def marc21_to_contribution(self, key, value):
                     ).lstrip('(').rstrip(')')
                     if place:
                         agent['place'] = place
+                identifier = build_identifier(value)
+                if identifier:
+                    agent['identifiedBy'] = identifier
 
         if value.get('4'):
             roles = []
@@ -769,6 +791,7 @@ def build_place():
         place = build_place()
         if place:
             publication['place'] = [place]
+
     publication['statement'] = build_statement(value, ind2)
     if subfields_c:
         subfield_c = subfields_c[0]
@@ -791,6 +814,11 @@ def build_place():
             pass
 
         publication['statement'].append(date)
+
+    identifier = build_identifier(value)
+    if identifier:
+        publication['identifiedBy'] = identifier
+
     return publication or None
 
 
@@ -1458,11 +1486,6 @@ def marc21_to_subjects(self, key, value):
         '610': False,
         '611': True
     }
-    source_per_prefix = {
-        '(RERO)': 'rero',
-        '(IDREF)': 'idref'
-    }
-
     source_per_indicator_2 = {
         '0': 'LCSH',
         '2': 'MeSH'
@@ -1475,46 +1498,26 @@ def marc21_to_subjects(self, key, value):
     if subfields_2:
         subfield_2 = subfields_2[0]
     subfields_a = utils.force_list(value.get('a', []))
-    source_prefix = ''
 
     if subfield_2 == 'rero':
-        # TODO: create a link to MEF when possible
         has_dollar_t = value.get('t')
 
-        subfields_0 = utils.force_list(value.get('0'))
-        subfield_0 = None
-        identified_by = None
-        if subfields_0:
-            #  remove the source prefix in parenthesis like '(RERO)'
-            source_prefix = re.sub(r'^(\(.*\)).*$', r'\1', subfields_0[0])
-            subfield_0 = re.sub(r'^\(.*\)(.*)$', r'\1', subfields_0[0])
-            source = source_per_prefix[source_prefix]
-            identified_by = {
-                'value': subfield_0,
-                'source': source,
-                'type': 'bf:Local'
-            }
-
         if tag_key in ('600', '610', '611') and has_dollar_t:
             tag_key += 't'
-
         data_type = type_per_tag[tag_key]
+
         start_with_digit = False
         if tag_key == '650':
             for subfield_a in subfields_a:
                 start_with_digit_regexp = re.compile(r'^\d')
                 match = start_with_digit_regexp.search(subfield_a)
                 if match:
-                    start_with_digit = True
                     data_type = 'bf:Temporal'
                     break
 
         subject = {
-            'source': 'rero',
             'type': data_type,
         }
-        if identified_by:
-            subject['identifiedBy'] = identified_by
 
         string_build = build_string_from_subfields(
             value,
@@ -1539,7 +1542,21 @@ def marc21_to_subjects(self, key, value):
         if tag_key == '655':
             field_key = 'genreForm'
 
-        if subject[field_data_per_tag[tag_key]]:
+        subfields_0 = utils.force_list(value.get('0'))
+        if data_type in ['bf:Person', 'bf:Organisation'] and subfields_0:
+            ref = get_contribution_link(
+                marc21.bib_id, marc21.rero_id, subfields_0[0], key, value)
+            if ref:
+                subject = {
+                    '$ref': ref,
+                    'type': data_type,
+                }
+        if not subject.get('$ref'):
+            identifier = build_identifier(value)
+            if identifier:
+                subject['identifiedBy'] = identifier
+
+        if subject.get('$ref') or subject.get(field_data_per_tag[tag_key]):
             subjects = self.get(field_key, [])
             subjects.append(subject)
             self[field_key] = subjects
@@ -1694,7 +1711,7 @@ def get_classif_type_and_subdivision_codes_from_980_2(subfield_2):
                 subject = {
                     'type': 'bf:Person',
                     'preferred_name': subfield_a,
-                    'source': 'factum'
+                    'source': 'Factum'
                 }
                 subjects = self.get('subjects', [])
                 subjects.append(subject)