From 057647d0613638c62346c1a28344e208fca1863b Mon Sep 17 00:00:00 2001 From: Peter Weber Date: Tue, 24 Oct 2023 11:20:35 +0200 Subject: [PATCH] entity: better replace_identified_by * Corrects handling of rero only and not found identifiers. Co-Authored-by: Peter Weber --- .../modules/entities/remote_entities/cli.py | 16 +++++--- .../entities/remote_entities/replace.py | 38 ++++++++++++------- .../test_remote_entities_api.py | 27 +++++++++---- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/rero_ils/modules/entities/remote_entities/cli.py b/rero_ils/modules/entities/remote_entities/cli.py index e97b09489d..2b78698881 100644 --- a/rero_ils/modules/entities/remote_entities/cli.py +++ b/rero_ils/modules/entities/remote_entities/cli.py @@ -135,11 +135,15 @@ def replace_identified_by_cli(field, dry_run, verbose, log_dir): fg='green' ) if verbose: - if replace_identified_by.not_found: + if replace_identified_by._error_count( + replace_identified_by.not_found): click.secho('Not found:', fg='yellow') - for pid, data in replace_identified_by.not_found.items(): - click.echo(f'\t{pid}: {data}') - if replace_identified_by.rero_only: + for etype, values in replace_identified_by.not_found.items(): + for pid, data in values.items(): + click.echo(f'\t{etype} {pid}: {data}') + if replace_identified_by._error_count( + replace_identified_by.rero_only): click.secho('RERO only:', fg='yellow') - for pid, data in replace_identified_by.rero_only.items(): - click.echo(f'\t{pid}: {data}') + for etype, values in replace_identified_by.rero_only.items(): + for pid, data in values.items(): + click.echo(f'\t{pid}: {data}') diff --git a/rero_ils/modules/entities/remote_entities/replace.py b/rero_ils/modules/entities/remote_entities/replace.py index e838311bce..70686e7963 100644 --- a/rero_ils/modules/entities/remote_entities/replace.py +++ b/rero_ils/modules/entities/remote_entities/replace.py @@ -24,6 +24,7 @@ import requests from flask import current_app +from sqlalchemy.orm.exc import NoResultFound from rero_ils.modules.documents.api import Document, DocumentsSearch from rero_ils.modules.utils import get_mef_url, get_timestamp, \ @@ -157,13 +158,15 @@ def _do_entity(self, entity, doc_pid): """ changed = False doc_entity_type = entity['entity']['type'] + self.not_found.setdefault(doc_entity_type, {}) + self.rero_only.setdefault(doc_entity_type, {}) if mef_type := self.entity_types.get(doc_entity_type): source_pid = entity['entity']['identifiedBy']['value'] source = entity['entity']['identifiedBy']['type'].lower() identifier = f'{source}:{source_pid}' if ( - identifier in self.not_found or - identifier in self.rero_only + identifier in self.not_found[doc_entity_type] or + identifier in self.rero_only[doc_entity_type] ): # MEF was not found previously. Do not try it again. return None @@ -200,7 +203,7 @@ def _do_entity(self, entity, doc_pid): f'{doc_entity_type} != {mef_entity_type} ' f': "{authorized_access_point}"' ) - self.rero_only[identifier] = info + self.rero_only[doc_entity_type][identifier] = info self.logger.warning( f'Type differ:{doc_pid} ' f'{self.field} - ({mef_type}) {identifier} {info}' @@ -208,8 +211,8 @@ def _do_entity(self, entity, doc_pid): else: authorized_access_point = mef_data.get( source, {}).get('authorized_access_point') - info = f'{doc_entity_type}: {authorized_access_point}' - self.rero_only[identifier] = info + info = f'{authorized_access_point}' + self.rero_only[doc_entity_type][identifier] = info self.logger.info( f'No other source found for document:{doc_pid} ' f'{self.field} - ({mef_type}|{doc_entity_type}) ' @@ -218,12 +221,14 @@ def _do_entity(self, entity, doc_pid): else: authorized_access_point = entity[ 'entity']['authorized_access_point'] - info = f'{doc_entity_type}: {authorized_access_point}' - self.not_found[identifier] = info + info = f'{authorized_access_point}' + self.not_found[doc_entity_type][identifier] = info self.logger.info( f'No MEF found for document:{doc_pid} ' f' - ({mef_type}) {identifier} "{info}"' ) + self.not_found = {k: v for k, v in self.not_found.items() if v} + self.rero_only = {k: v for k, v in self.rero_only.items() if v} return changed def _replace_entities_in_document(self, doc_id): @@ -232,7 +237,7 @@ def _replace_entities_in_document(self, doc_id): :param doc_id: (string) document id """ changed = False - with contextlib.suppress(Exception): + with contextlib.suppress(NoResultFound): doc = Document.get_record(doc_id) entities_to_update = filter( lambda c: c.get('entity', {}).get('identifiedBy'), @@ -248,6 +253,10 @@ def _replace_entities_in_document(self, doc_id): if changed: return doc + def _error_count(self, counter_dict): + """Summ of error count.""" + return sum(len(values) for values in counter_dict.values()) + def run(self): """Replace identifiedBy with $ref.""" self.changed = 0 @@ -256,16 +265,17 @@ def run(self): self.logger.info( f'Found {self.field} identifiedBy: {self.count()}') query = self.query \ - .params(preserve_order=True) \ - .sort({'_created': {'order': 'asc'}}) \ - .source(['pid', self.field]) + .params(preserve_order=True) \ + .sort({'_created': {'order': 'asc'}}) \ + .source(['pid', self.field]) for hit in list(query.scan()): if doc := self._replace_entities_in_document(hit.meta.id): self.changed += 1 if not self.dry_run: doc.update(data=doc, dbcommit=True, reindex=True) self.set_timestamp() - return self.changed, len(self.not_found), len(self.rero_only) + return self.changed, self._error_count(self.not_found), \ + self._error_count(self.rero_only) def get_timestamp(self): """Get time stamp.""" @@ -287,8 +297,8 @@ def set_timestamp(self): # rero only: entity was found but has only `rero` as source. data[self.field] = { 'changed': self.changed, - 'not found': len(self.not_found), - 'rero only': len(self.rero_only), + 'not found': self._error_count(self.not_found), + 'rero only': self._error_count(self.rero_only), 'time': datetime.now(timezone.utc), } set_timestamp(self.timestamp_name, **data) diff --git a/tests/ui/entities/remote_entities/test_remote_entities_api.py b/tests/ui/entities/remote_entities/test_remote_entities_api.py index 1e1f44111e..eb0cbc73d0 100644 --- a/tests/ui/entities/remote_entities/test_remote_entities_api.py +++ b/tests/ui/entities/remote_entities/test_remote_entities_api.py @@ -380,9 +380,13 @@ def test_replace_identified_by( assert not_found == 2 assert rero_only == 0 assert replace_identified_by.not_found == { - 'gnd:1161956409': 'bf:Organisation: Convegno internazionale ' - 'di italianistica Craiova', - 'rero:A003633163': 'bf:Person: Nebehay, Christian Michael' + 'bf:Organisation': { + 'gnd:1161956409': 'Convegno internazionale ' + 'di italianistica Craiova' + }, + 'bf:Person': { + 'rero:A003633163': 'Nebehay, Christian Michael' + } } replace_identified_by.set_timestamp() data = replace_identified_by.get_timestamp() @@ -404,7 +408,9 @@ def test_replace_identified_by( assert not_found == 0 assert rero_only == 1 assert replace_identified_by.rero_only == { - 'rero:A003633163': 'bf:Person: Nebehay, Christian Michael' + 'bf:Person': { + 'rero:A003633163': 'Nebehay, Christian Michael' + } } # with MEF response for concepts in subjects replace_identified_by = ReplaceIdentifiedBy( @@ -434,10 +440,15 @@ def test_replace_identified_by( assert not_found == 0 assert rero_only == 3 assert dict(sorted(replace_identified_by.rero_only.items())) == { - 'rero:A009963344': - 'bf:Person: Athenagoras (patriarche oecuménique ; 1)', - 'rero:A021039750': 'bf:Topic: Bases de données déductives', - 'rero:A009975209': 'bf:Place: Europe occidentale' + 'bf:Person': { + 'rero:A009963344': 'Athenagoras (patriarche oecuménique ; 1)' + }, + 'bf:Topic': { + 'rero:A021039750': 'Bases de données déductives' + }, + 'bf:Place': { + 'rero:A009975209': 'Europe occidentale' + } }