Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

entity: better replace_identified_by #3497

Merged
merged 1 commit into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions rero_ils/modules/entities/remote_entities/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,11 +135,15 @@ def replace_identified_by_cli(field, dry_run, verbose, log_dir):
fg='green'
)
if verbose:
if replace_identified_by.not_found:
if replace_identified_by._error_count(
replace_identified_by.not_found):
click.secho('Not found:', fg='yellow')
for pid, data in replace_identified_by.not_found.items():
click.echo(f'\t{pid}: {data}')
if replace_identified_by.rero_only:
for etype, values in replace_identified_by.not_found.items():
for pid, data in values.items():
click.echo(f'\t{etype} {pid}: {data}')
if replace_identified_by._error_count(
replace_identified_by.rero_only):
click.secho('RERO only:', fg='yellow')
for pid, data in replace_identified_by.rero_only.items():
click.echo(f'\t{pid}: {data}')
for etype, values in replace_identified_by.rero_only.items():
for pid, data in values.items():
click.echo(f'\t{pid}: {data}')
38 changes: 24 additions & 14 deletions rero_ils/modules/entities/remote_entities/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import requests
from flask import current_app
from sqlalchemy.orm.exc import NoResultFound

from rero_ils.modules.documents.api import Document, DocumentsSearch
from rero_ils.modules.utils import get_mef_url, get_timestamp, \
Expand Down Expand Up @@ -157,13 +158,15 @@ def _do_entity(self, entity, doc_pid):
"""
changed = False
doc_entity_type = entity['entity']['type']
self.not_found.setdefault(doc_entity_type, {})
self.rero_only.setdefault(doc_entity_type, {})
if mef_type := self.entity_types.get(doc_entity_type):
source_pid = entity['entity']['identifiedBy']['value']
source = entity['entity']['identifiedBy']['type'].lower()
identifier = f'{source}:{source_pid}'
if (
identifier in self.not_found or
identifier in self.rero_only
identifier in self.not_found[doc_entity_type] or
identifier in self.rero_only[doc_entity_type]
):
# MEF was not found previously. Do not try it again.
return None
Expand Down Expand Up @@ -200,16 +203,16 @@ def _do_entity(self, entity, doc_pid):
f'{doc_entity_type} != {mef_entity_type} '
f': "{authorized_access_point}"'
)
self.rero_only[identifier] = info
self.rero_only[doc_entity_type][identifier] = info
self.logger.warning(
f'Type differ:{doc_pid} '
f'{self.field} - ({mef_type}) {identifier} {info}'
)
else:
authorized_access_point = mef_data.get(
source, {}).get('authorized_access_point')
info = f'{doc_entity_type}: {authorized_access_point}'
self.rero_only[identifier] = info
info = f'{authorized_access_point}'
self.rero_only[doc_entity_type][identifier] = info
self.logger.info(
f'No other source found for document:{doc_pid} '
f'{self.field} - ({mef_type}|{doc_entity_type}) '
Expand All @@ -218,12 +221,14 @@ def _do_entity(self, entity, doc_pid):
else:
authorized_access_point = entity[
'entity']['authorized_access_point']
info = f'{doc_entity_type}: {authorized_access_point}'
self.not_found[identifier] = info
info = f'{authorized_access_point}'
self.not_found[doc_entity_type][identifier] = info
self.logger.info(
f'No MEF found for document:{doc_pid} '
f' - ({mef_type}) {identifier} "{info}"'
)
self.not_found = {k: v for k, v in self.not_found.items() if v}
self.rero_only = {k: v for k, v in self.rero_only.items() if v}
return changed

def _replace_entities_in_document(self, doc_id):
Expand All @@ -232,7 +237,7 @@ def _replace_entities_in_document(self, doc_id):
:param doc_id: (string) document id
"""
changed = False
with contextlib.suppress(Exception):
with contextlib.suppress(NoResultFound):
doc = Document.get_record(doc_id)
entities_to_update = filter(
lambda c: c.get('entity', {}).get('identifiedBy'),
Expand All @@ -248,6 +253,10 @@ def _replace_entities_in_document(self, doc_id):
if changed:
return doc

def _error_count(self, counter_dict):
"""Summ of error count."""
return sum(len(values) for values in counter_dict.values())

def run(self):
"""Replace identifiedBy with $ref."""
self.changed = 0
Expand All @@ -256,16 +265,17 @@ def run(self):
self.logger.info(
f'Found {self.field} identifiedBy: {self.count()}')
query = self.query \
.params(preserve_order=True) \
.sort({'_created': {'order': 'asc'}}) \
.source(['pid', self.field])
.params(preserve_order=True) \
.sort({'_created': {'order': 'asc'}}) \
.source(['pid', self.field])
for hit in list(query.scan()):
if doc := self._replace_entities_in_document(hit.meta.id):
self.changed += 1
if not self.dry_run:
doc.update(data=doc, dbcommit=True, reindex=True)
self.set_timestamp()
return self.changed, len(self.not_found), len(self.rero_only)
return self.changed, self._error_count(self.not_found), \
self._error_count(self.rero_only)

def get_timestamp(self):
"""Get time stamp."""
Expand All @@ -287,8 +297,8 @@ def set_timestamp(self):
# rero only: entity was found but has only `rero` as source.
data[self.field] = {
'changed': self.changed,
'not found': len(self.not_found),
'rero only': len(self.rero_only),
'not found': self._error_count(self.not_found),
'rero only': self._error_count(self.rero_only),
'time': datetime.now(timezone.utc),
}
set_timestamp(self.timestamp_name, **data)
27 changes: 19 additions & 8 deletions tests/ui/entities/remote_entities/test_remote_entities_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,13 @@ def test_replace_identified_by(
assert not_found == 2
assert rero_only == 0
assert replace_identified_by.not_found == {
'gnd:1161956409': 'bf:Organisation: Convegno internazionale '
'di italianistica Craiova',
'rero:A003633163': 'bf:Person: Nebehay, Christian Michael'
'bf:Organisation': {
'gnd:1161956409': 'Convegno internazionale '
'di italianistica Craiova'
},
'bf:Person': {
'rero:A003633163': 'Nebehay, Christian Michael'
}
}
replace_identified_by.set_timestamp()
data = replace_identified_by.get_timestamp()
Expand All @@ -404,7 +408,9 @@ def test_replace_identified_by(
assert not_found == 0
assert rero_only == 1
assert replace_identified_by.rero_only == {
'rero:A003633163': 'bf:Person: Nebehay, Christian Michael'
'bf:Person': {
'rero:A003633163': 'Nebehay, Christian Michael'
}
}
# with MEF response for concepts in subjects
replace_identified_by = ReplaceIdentifiedBy(
Expand Down Expand Up @@ -434,10 +440,15 @@ def test_replace_identified_by(
assert not_found == 0
assert rero_only == 3
assert dict(sorted(replace_identified_by.rero_only.items())) == {
'rero:A009963344':
'bf:Person: Athenagoras (patriarche oecuménique ; 1)',
'rero:A021039750': 'bf:Topic: Bases de données déductives',
'rero:A009975209': 'bf:Place: Europe occidentale'
'bf:Person': {
'rero:A009963344': 'Athenagoras (patriarche oecuménique ; 1)'
},
'bf:Topic': {
'rero:A021039750': 'Bases de données déductives'
},
'bf:Place': {
'rero:A009975209': 'Europe occidentale'
}
}


Expand Down