Skip to content

Commit

Permalink
entity: better replace_identified_by
Browse files Browse the repository at this point in the history
* Corrects handling of rero only and not found identifiers.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
  • Loading branch information
rerowep committed Oct 24, 2023
1 parent 63376d9 commit 0745b02
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 20 deletions.
16 changes: 10 additions & 6 deletions rero_ils/modules/entities/remote_entities/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,15 @@ def replace_identified_by_cli(field, dry_run, verbose, log_dir):
fg='green'
)
if verbose:
if replace_identified_by.not_found:
if replace_identified_by._error_count(
replace_identified_by.not_found):
click.secho('Not found:', fg='yellow')
for pid, data in replace_identified_by.not_found.items():
click.echo(f'\t{pid}: {data}')
if replace_identified_by.rero_only:
for etype, values in replace_identified_by.not_found.items():
for pid, data in values.items():
click.echo(f'\t{etype} {pid}: {data}')
if replace_identified_by._error_count(
replace_identified_by.rero_only):
click.secho('RERO only:', fg='yellow')
for pid, data in replace_identified_by.rero_only.items():
click.echo(f'\t{pid}: {data}')
for etype, values in replace_identified_by.rero_only.items():
for pid, data in values.items():
click.echo(f'\t{pid}: {data}')
33 changes: 19 additions & 14 deletions rero_ils/modules/entities/remote_entities/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

"""Replace identifiedBy with $ref from MEF."""

import contextlib
from copy import deepcopy
from datetime import datetime, timezone

Expand Down Expand Up @@ -157,13 +156,15 @@ def _do_entity(self, entity, doc_pid):
"""
changed = False
doc_entity_type = entity['entity']['type']
self.not_found.setdefault(doc_entity_type, {})
self.rero_only.setdefault(doc_entity_type, {})
if mef_type := self.entity_types.get(doc_entity_type):
source_pid = entity['entity']['identifiedBy']['value']
source = entity['entity']['identifiedBy']['type'].lower()
identifier = f'{source}:{source_pid}'
if (
identifier in self.not_found or
identifier in self.rero_only
identifier in self.not_found[doc_entity_type] or
identifier in self.rero_only[doc_entity_type]
):
# MEF was not found previously. Do not try it again.
return None
Expand Down Expand Up @@ -200,7 +201,7 @@ def _do_entity(self, entity, doc_pid):
f'{doc_entity_type} != {mef_entity_type} '
f': "{authorized_access_point}"'
)
self.rero_only[identifier] = info
self.rero_only[doc_entity_type][identifier] = info
self.logger.warning(
f'Type differ:{doc_pid} '
f'{self.field} - ({mef_type}) {identifier} {info}'
Expand All @@ -209,7 +210,7 @@ def _do_entity(self, entity, doc_pid):
authorized_access_point = mef_data.get(
source, {}).get('authorized_access_point')
info = f'{doc_entity_type}: {authorized_access_point}'
self.rero_only[identifier] = info
self.rero_only[doc_entity_type][identifier] = info
self.logger.info(
f'No other source found for document:{doc_pid} '
f'{self.field} - ({mef_type}|{doc_entity_type}) '
Expand All @@ -219,7 +220,7 @@ def _do_entity(self, entity, doc_pid):
authorized_access_point = entity[
'entity']['authorized_access_point']
info = f'{doc_entity_type}: {authorized_access_point}'
self.not_found[identifier] = info
self.not_found[doc_entity_type][identifier] = info
self.logger.info(
f'No MEF found for document:{doc_pid} '
f' - ({mef_type}) {identifier} "{info}"'
Expand All @@ -232,8 +233,7 @@ def _replace_entities_in_document(self, doc_id):
:param doc_id: (string) document id
"""
changed = False
with contextlib.suppress(Exception):
doc = Document.get_record(doc_id)
if doc := Document.get_record(doc_id):
entities_to_update = filter(
lambda c: c.get('entity', {}).get('identifiedBy'),
doc.get(self.field, {})
Expand All @@ -248,6 +248,10 @@ def _replace_entities_in_document(self, doc_id):
if changed:
return doc

def _error_count(self, counter_dict):
"""Summ of error count."""
return sum(len(values) for values in counter_dict.values())

def run(self):
"""Replace identifiedBy with $ref."""
self.changed = 0
Expand All @@ -256,16 +260,17 @@ def run(self):
self.logger.info(
f'Found {self.field} identifiedBy: {self.count()}')
query = self.query \
.params(preserve_order=True) \
.sort({'_created': {'order': 'asc'}}) \
.source(['pid', self.field])
.params(preserve_order=True) \
.sort({'_created': {'order': 'asc'}}) \
.source(['pid', self.field])
for hit in list(query.scan()):
if doc := self._replace_entities_in_document(hit.meta.id):
self.changed += 1
if not self.dry_run:
doc.update(data=doc, dbcommit=True, reindex=True)
self.set_timestamp()
return self.changed, len(self.not_found), len(self.rero_only)
return self.changed, self._error_count(self.not_found), \
self._error_count(self.rero_only)

def get_timestamp(self):
"""Get time stamp."""
Expand All @@ -287,8 +292,8 @@ def set_timestamp(self):
# rero only: entity was found but has only `rero` as source.
data[self.field] = {
'changed': self.changed,
'not found': len(self.not_found),
'rero only': len(self.rero_only),
'not found': self._error_count(self.not_found),
'rero only': self._error_count(self.rero_only),
'time': datetime.now(timezone.utc),
}
set_timestamp(self.timestamp_name, **data)

0 comments on commit 0745b02

Please sign in to comment.