From e6b2683364d8a28d130c330bd3da5caa4f9d12d8 Mon Sep 17 00:00:00 2001 From: Renaud Michotte Date: Thu, 14 Nov 2019 13:07:30 +0100 Subject: [PATCH] persons: fixes removing persons records when document is suppressed * Fix indexer schema mapping for mef authorities * Refactoring of document listeners using MefPerson class * Closes #601 Co-Authored-by: Renaud Michotte --- rero_ils/modules/documents/api.py | 6 +- rero_ils/modules/documents/listener.py | 132 ++++++++----------------- rero_ils/modules/ext.py | 4 +- rero_ils/modules/indexer_utils.py | 5 + rero_ils/modules/mef_persons/api.py | 41 +------- rero_ils/modules/mef_persons/utils.py | 22 +++++ 6 files changed, 82 insertions(+), 128 deletions(-) create mode 100644 rero_ils/modules/mef_persons/utils.py diff --git a/rero_ils/modules/documents/api.py b/rero_ils/modules/documents/api.py index 7efe15d465..2dded167da 100644 --- a/rero_ils/modules/documents/api.py +++ b/rero_ils/modules/documents/api.py @@ -90,6 +90,10 @@ def can_edit(self): # TODO: Make this condition on data return not self.harvested + def get_linked_authors(self): + """Get a list containing all authors linked to an authority""" + return [a['$ref'] for a in self.get('authors', []) if a.get('$ref')] + def get_number_of_items(self): """Get number of items for document.""" from ..items.api import ItemsSearch @@ -138,6 +142,6 @@ def dumps(self, **kwargs): provision_activity["_text"] = \ publication_statement_text(provision_activity) series = dump.get('series') - for series_element in series: + for series_element in series or []: series_element["_text"] = series_format_text(series_element) return dump diff --git a/rero_ils/modules/documents/listener.py b/rero_ils/modules/documents/listener.py index 70317be9be..015e04188f 100644 --- a/rero_ils/modules/documents/listener.py +++ b/rero_ils/modules/documents/listener.py @@ -17,12 +17,8 @@ """Signals connector for Document.""" -from flask import current_app -from invenio_indexer.api import RecordIndexer -from
invenio_jsonschemas import current_jsonschemas -from invenio_search import current_search -from requests import codes as requests_codes -from requests import get as requests_get + +from elasticsearch.exceptions import NotFoundError from .views import create_publication_statement from ..documents.api import DocumentsSearch @@ -30,6 +26,8 @@ from ..item_types.api import ItemType from ..items.api import Item from ..locations.api import Location +from ..mef_persons.api import MefPerson +from ..mef_persons.utils import resolve_mef_uri from ..organisations.api import Organisation @@ -128,91 +126,47 @@ def mef_person_revert(sender, *args, **kwargs): def mef_person_update_index(sender, *args, **kwargs): - """Index MEF person in ES.""" + """Index MEF person in ES. + + When indexing a document, this function will check if some author are + authorities. Foreach authority, this function will check if it is already + created ; if not thne the authority will be created + + :param record: the document being processed + """ record = kwargs['record'] - if 'documents' in record.get('$schema', ''): - authors = record.get('authors', []) - for author in authors: - mef_url = author.get('$ref') - if mef_url: - mef_url = mef_url.replace( - 'mef.rero.ch', - current_app.config['RERO_ILS_MEF_HOST'] - ) - request = requests_get(url=mef_url, params=dict( - resolve=1, - sources=1 - )) - if request.status_code == requests_codes.ok: - data = request.json() - id = data['id'] - data = data.get('metadata') - if data: - data['id'] = id - data['$schema'] = current_jsonschemas.path_to_url( - current_app.config[ - 'RERO_ILS_PERSONS_MEF_SCHEMA' - ] - ) - indexer = RecordIndexer() - index, doc_type = indexer.record_to_index(data) - indexer.client.index( - id=id, - index=index, - doc_type=doc_type, - body=data, - ) - current_search.flush_and_refresh(index) - else: - current_app.logger.error( - 'Mef resolver request error: {stat} {url}'.format( - stat=request.status_code, - url=mef_url - ) - ) - raise 
Exception('unable to resolve') + if 'documents' not in record.get('$schema', ''): + return + + refs = [a['$ref'] for a in record.get('authors', []) if a.get('$ref')] + for author_ref in refs: + data = resolve_mef_uri(author_ref) + if data and data.get('id'): + try: + person = MefPerson.get_record_by_pid(data['id']) + person.reindex() + except NotFoundError: + print("author_ref [{r}] not found, create it ...".format(r=author_ref)) + person = MefPerson.create(data) + person.reindex() def mef_person_delete(sender, *args, **kwargs): - """Delete signal.""" + """Delete signal about a document + + When deleting a document, we need to check if some authors are + authorities. If one of these authority has no other document linked to it + then this authority must be deleted from index. + + :param record: the document being suppressed + """ record = kwargs['record'] - if 'documents' in record.get('$schema', ''): - authors = record.get('authors', []) - for author in authors: - mef_url = author.get('$ref') - if mef_url: - mef_url = mef_url.replace( - 'mef.rero.ch', - current_app.config['RERO_ILS_MEF_HOST'] - ) - request = requests_get(url=mef_url, params=dict( - resolve=1, - sources=1 - )) - if request.status_code == requests_codes.ok: - data = request.json() - id = data['id'] - data = data.get('metadata') - if data: - search = DocumentsSearch() - count = search.filter( - 'match', - authors__pid=id - ).execute().hits.total - if count == 1: - indexer = RecordIndexer() - index, doc_type = indexer.record_to_index(data) - indexer.client.delete( - id=id, - index=index, - doc_type=doc_type - ) - current_search.flush_and_refresh(index) - else: - current_app.logger.error( - 'Mef resolver request error: {result} {url}'.format( - result=request.status_code, - url=mef_url - ) - ) - raise Exception('unable to resolve') + if 'documents' not in record.get('$schema', ''): + return + + for author_ref in record.get_linked_authors(): + data = resolve_mef_uri(author_ref) + if data and 
data.get('id'): + person = MefPerson.get_record(data['id']) + if len(person.get_linked_document_pids()) == 1: + person.delete() diff --git a/rero_ils/modules/ext.py b/rero_ils/modules/ext.py index a6a29cc8cd..cdd16e976c 100644 --- a/rero_ils/modules/ext.py +++ b/rero_ils/modules/ext.py @@ -25,7 +25,7 @@ from invenio_indexer.signals import before_record_index from invenio_oaiharvester.signals import oaiharvest_finished from invenio_records.signals import after_record_delete, after_record_insert, \ - after_record_revert, after_record_update + after_record_revert, after_record_update, before_record_delete from .documents.listener import enrich_document_data, mef_person_delete, \ mef_person_insert, mef_person_revert, mef_person_update @@ -115,5 +115,5 @@ def register_signals(self): after_record_insert.connect(mef_person_insert) after_record_update.connect(mef_person_update) - after_record_delete.connect(mef_person_delete) + before_record_delete.connect(mef_person_delete) after_record_revert.connect(mef_person_revert) diff --git a/rero_ils/modules/indexer_utils.py b/rero_ils/modules/indexer_utils.py index 6d5eefa2e8..df56a353d3 100644 --- a/rero_ils/modules/indexer_utils.py +++ b/rero_ils/modules/indexer_utils.py @@ -41,6 +41,11 @@ def record_to_index(record): # put all document in the same index if re.search(r'/documents/', schema): schema = re.sub(r'-.*\.json', '.json', schema) + # authorities specific transformation + if re.search(r'/authorities/', schema): + schema = re.sub(r'/authorities/', '/persons/', schema) + schema = re.sub(r'mef-person', 'mef_person', schema) + index, doc_type = schema_to_index(schema, index_names=index_names) if index and doc_type: diff --git a/rero_ils/modules/mef_persons/api.py b/rero_ils/modules/mef_persons/api.py index f778d08bab..9716a9b99a 100644 --- a/rero_ils/modules/mef_persons/api.py +++ b/rero_ils/modules/mef_persons/api.py @@ -64,13 +64,10 @@ class MefPerson(ElasticsearchRecord): @classmethod def get_record_by_pid(cls, pid): 
"""Get elasticsearch record by pid value.""" - - #print(cls.get_documents_by_person_pid(pid)) - pers = MefPerson(cls.get_record(pid)) return MefPerson(cls.get_record(pid)) - def get_number_of_linked_documents(self, org_pid=None): - """Get number of linked documents for person.""" + def get_linked_document_pids(self, org_pid=None): + """Get list of linked document pids for person.""" search = DocumentsSearch() search = search.filter( 'term', @@ -80,36 +77,8 @@ def get_number_of_linked_documents(self, org_pid=None): search = search.filter( 'term', holdings__organisation__organisation_pid=org_pid ) - results = search.source().count() - return results + return [hit.pid for hit in search.scan()] def get_linked_documents(self, org_pid=None): - """Get linked documents.""" - search = DocumentsSearch() - search = search.filter( - 'term', - authors__pid=self.pid - ) - if org_pid: - search = search.filter( - 'term', holdings__organisation__organisation_pid=org_pid - ) - - return [result.get('_source') for result in search.execute().hits.hits] - - @classmethod - def get_record_by_mef_uri(cls, uri): - """Get elasticsearch record by mef uri.""" - mef_url = uri.replace( - 'mef.rero.ch', - current_app.config['RERO_ILS_MEF_HOST'] - ) - request = requests_get(url=mef_url, params=dict( - resolve=1, - sources=1 - )) - if request.status_code == requests_codes.ok: - data = request.json() - id = data['id'] - return MefPerson(cls.get_record(id)) - + for pid in self.get_linked_document_pids(): + yield Document.get_record_by_pid(pid) diff --git a/rero_ils/modules/mef_persons/utils.py b/rero_ils/modules/mef_persons/utils.py new file mode 100644 index 0000000000..a12cab7830 --- /dev/null +++ b/rero_ils/modules/mef_persons/utils.py @@ -0,0 +1,22 @@ +from flask import current_app +from requests import codes as requests_codes +from requests import get as requests_get + + +def resolve_mef_uri(uri): + """Resolve a mef uri to get associated data. 
+ + Call the external resource corresponding to the uri and get the associated + data if data are valid + :return associated uri data as a dictionary ; Return None if resolution + failed or data are inconsistent + """ + mef_url = uri.replace( + 'mef.rero.ch', + current_app.config['RERO_ILS_MEF_HOST'] + ) + r = requests_get(url=mef_url, params={'resolve': 1, 'sources': 1}) + if r.status_code == requests_codes.ok: + data = r.json() + if data.get('id'): + return data