From b22b609b2ef14e3b08f58723cf8129ec52d097a9 Mon Sep 17 00:00:00 2001 From: Bertrand Zuchuat Date: Wed, 15 Sep 2021 15:35:15 +0200 Subject: [PATCH] data: implement stop words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adds a stop word based normalizer. * Adds fields on the documents for sorting. * Adds fields on the contributions for sorting. * Adds elasticsearch configuration for sorting on resources. * Adds language mapping for code conversion. * Closes #2396. Co-Authored-by: Bertrand Zuchuat Co-Authored-by: Johnny Mariéthoz --- rero_ils/config.py | 136 +++++++++++++++--- rero_ils/es_templates/v7/record.json | 51 +++++++ .../v7/contributions/contribution-v0.0.1.json | 27 +++- rero_ils/modules/documents/listener.py | 27 +++- .../v7/documents/document-v0.0.1.json | 33 ++--- rero_ils/modules/ext.py | 2 + rero_ils/modules/normalizer_stop_words.py | 73 ++++++++++ rero_ils/utils.py | 12 ++ tests/unit/test_normalizer_stop_words.py | 67 +++++++++ tests/unit/test_utils.py | 9 +- 10 files changed, 384 insertions(+), 53 deletions(-) create mode 100644 rero_ils/modules/normalizer_stop_words.py create mode 100644 tests/unit/test_normalizer_stop_words.py diff --git a/rero_ils/config.py b/rero_ils/config.py index a48f4e32da..d1a17a8c79 100644 --- a/rero_ils/config.py +++ b/rero_ils/config.py @@ -2205,6 +2205,58 @@ def _(x): RECORDS_REST_DEFAULT_SORT['circ_policies'] = dict( query='bestmatch', noquery='name') +# ------ COLLECTIONS SORT +RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict( + fields=['start_date', 'title_sort'], title='Start date and title', + default_order='asc' +) +RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict( + fields=['title_sort'], title='title', + default_order='asc' +) +RECORDS_REST_DEFAULT_SORT['collections'] = dict( + query='bestmatch', noquery='start_date') + +# ------ CONTRIBUTIONS SORT +RECORDS_REST_SORT_OPTIONS['contributions']['fr_name'] = dict( + fields=[ + 'idref_authorized_access_point_sort', + 'rero_authorized_access_point_sort', + 'gnd_authorized_access_point_sort', + ], + title='Collection french name', + default_order='asc' +) +RECORDS_REST_SORT_OPTIONS['contributions']['de_name'] = dict( + fields=[ + 'gnd_authorized_access_point_sort', + 'idref_authorized_access_point_sort', + 'rero_authorized_access_point_sort' + ], + title='Collection german name', + default_order='asc' +) + +# ------ DOCUMENTS SORT +RECORDS_REST_SORT_OPTIONS['documents']['title'] = dict( + fields=['sort_title'], title='Document title', + default_order='asc' +) + +RECORDS_REST_SORT_OPTIONS['documents']['pub_date_new'] = dict( + fields=[ + '-provision_activity_end_date', + '-provision_activity_start_date' + ], + title='Document date (newest)', + default_order='desc' +) + +RECORDS_REST_SORT_OPTIONS['documents']['pub_date_old'] = dict( + fields=['provision_activity_start_date'], title='Document date (oldest)', + default_order='asc' +) + # ------ HOLDINGS SORT RECORDS_REST_SORT_OPTIONS['holdings']['library_location'] = dict( fields=['library.pid', 'location.pid'], @@ -2214,7 +2266,19 @@ def _(x): RECORDS_REST_DEFAULT_SORT['holdings'] = dict( query='bestmatch', noquery='library_location') -# ------ ITEM SORT +# ------ ITEMS SORT +RECORDS_REST_SORT_OPTIONS['items']['call_number'] = dict( + fields=['call_number'], title='Call Number', + default_order='asc' +) +RECORDS_REST_SORT_OPTIONS['items']['second_call_number'] = dict( + fields=['second_call_number'], title='Second call Number', + default_order='asc' +) 
+RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict( + fields=['issue.expected_date'], title='Issue expected date', + default_order='asc' +) RECORDS_REST_SORT_OPTIONS['items']['enumeration_chronology'] = dict( fields=['-enumerationAndChronology'], title='Enumeration and Chronology', default_order='desc' @@ -2239,6 +2303,10 @@ def _(x): fields=['library_name'], title='Library name', default_order='asc' ) +RECORDS_REST_SORT_OPTIONS['libraries']['code'] = dict( + fields=['code'], title='Library code', + default_order='asc' +) RECORDS_REST_DEFAULT_SORT['libraries'] = dict( query='bestmatch', noquery='name') @@ -2282,19 +2350,6 @@ def _(x): RECORDS_REST_DEFAULT_SORT['patron_types'] = dict( query='bestmatch', noquery='name') -# ------ VENDORS SORT -RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict( - fields=['vendor_name'], title='Vendor name', - default_order='asc' -) -RECORDS_REST_DEFAULT_SORT['vendors'] = dict( - query='bestmatch', noquery='name') - -# ------ ITEMS SORT -RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict( - fields=['issue.expected_date'], title='Issue expected date', - default_order='asc' -) # ------ TEMPLATES SORT RECORDS_REST_SORT_OPTIONS['templates']['name'] = dict( fields=['name_sort'], title='Template name', @@ -2303,18 +2358,14 @@ def _(x): RECORDS_REST_DEFAULT_SORT['templates'] = dict( query='bestmatch', noquery='name') -# ------ COLLECTIONS SORT -RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict( - fields=['start_date', 'title_sort'], title='Start date and title', - default_order='asc' -) -RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict( - fields=['title_sort'], title='title', +# ------ VENDORS SORT +RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict( + fields=['vendor_name'], title='Vendor name', default_order='asc' ) +RECORDS_REST_DEFAULT_SORT['vendors'] = dict( + query='bestmatch', noquery='name') -RECORDS_REST_DEFAULT_SORT['collections'] = dict( - query='bestmatch', noquery='start_date') # Detailed View Configuration # =========================== @@ -2806,3 +2857,42 @@ def _(x): # OAuth base template OAUTH2SERVER_COVER_TEMPLATE = 'rero_ils/oauth/base.html' + +# STOP WORDS +# ========== +# ACTIVATE STOP WORDS NORMALIZATION +RERO_ILS_STOP_WORDS_ACTIVATE = True +# PUNCTUATION +RERO_ILS_STOP_WORDS_PUNCTUATION = [ + r'\[', r'\]', '"', ',', ';', ':', r'\.', '_', + r'\?', r'\!', r'\*', r'\+', '\n' +] +# STOP WORDS BY LANGUAGE +# Possibility to add a default configuration with a "default" entry. 
+# This default configuration will be used if the language is not present +RERO_ILS_STOP_WORDS = { + 'dan': ["de", "den", "det", "en", "et"], + 'dut': [ + "d'", "de", "den", "der", "des", "het", "'s", "'t", "een", + "eene", "eener", "eens", "ene", "'n"], + 'eng': ["a", "an", "the"], + 'epo': ["la", "l'", "unu"], + 'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"], + 'ger': ["das", "dem", "den", "der", "des", "die"], + 'hun': ["a", "az", "egy"], + 'ita': [ + "gli", "i", "il", "l'", "la", "le", "li", "lo", "un", "un'", + "una", "uno"], + 'nor': ["de", "den", "det", "ei", "en", "et"], + 'por': ["a", "as", "o", "os", "um", "uma", "umas", "uns"], + 'spa': ["el", "la", "las", "lo", "los", "un", "una", "unas", "unos"], + 'swe': ["de", "den", "det", "en", "ett"] +} + +# LANGUAGE MAPPING +# ================ +RERO_ILS_LANGUAGE_MAPPING = { + 'dum': 'dut', # neerlandais + 'fra': 'fre', # french + 'nld': 'dut', # neerlandais +} diff --git a/rero_ils/es_templates/v7/record.json b/rero_ils/es_templates/v7/record.json index ae0cd26838..ba0a909ac8 100644 --- a/rero_ils/es_templates/v7/record.json +++ b/rero_ils/es_templates/v7/record.json @@ -6,6 +6,11 @@ "max_result_window": "100000", "analysis": { "filter": { + "edge_ngram_filter": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 10 + }, "french_elision": { "type": "elision", "articles_case": true, @@ -25,6 +30,32 @@ "puisqu" ] }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "french_stop": { + "type": "stop", + "stopwords": "_french_" + }, + "german_stop": { + "type": "stop", + "stopwords": "_german_" + }, + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, "french_stemmer": { "type": "stemmer", "language": "light_french" @@ -65,6 +96,26 @@ "icu_folding", "german_normalization" ] + }, + "autocomplete": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "icu_normalizer", + "icu_folding", + "french_elision", + "italian_elision", + "edge_ngram_filter" + ] + } + }, + "normalizer": { + "sort_normalizer": { + "type": "custom", + "filter": [ + "lowercase" + ] } } } diff --git a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json index ac1bf5afc1..3bc1b25499 100644 --- a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json +++ b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json @@ -29,6 +29,18 @@ "analyzer": "autocomplete", "search_analyzer": "standard" }, + "gnd_authorized_access_point_sort": { + "type": "keyword", + "normalizer": "sort_normalizer" + }, + "idref_authorized_access_point_sort": { + "type": "keyword", + "normalizer": "sort_normalizer" + }, + "rero_authorized_access_point_sort": { + "type": "keyword", + "normalizer": "sort_normalizer" + }, "$schema": { "type": "keyword" }, @@ -100,7 +112,10 @@ }, "authorized_access_point": { "type": "text", - "copy_to": "autocomplete_name" + "copy_to": [ + "autocomplete_name", + "gnd_authorized_access_point_sort" + ] }, "qualifier": { "type": "keyword" @@ -203,7 +218,10 @@ }, "authorized_access_point": { "type": "text", - "copy_to": "autocomplete_name" + "copy_to": [ + "autocomplete_name", + 
"idref_authorized_access_point_sort" + ] }, "qualifier": { "type": "keyword" @@ -306,7 +324,10 @@ }, "authorized_access_point": { "type": "text", - "copy_to": "autocomplete_name" + "copy_to": [ + "autocomplete_name", + "rero_authorized_access_point_sort" + ] }, "qualifier": { "type": "keyword" diff --git a/rero_ils/modules/documents/listener.py b/rero_ils/modules/documents/listener.py index 0c78e6cd71..07fbc86ab3 100644 --- a/rero_ils/modules/documents/listener.py +++ b/rero_ils/modules/documents/listener.py @@ -17,6 +17,7 @@ """Signals connector for Document.""" +from flask.globals import current_app from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13 from .utils import create_contributions, title_format_text_head @@ -26,6 +27,7 @@ from ..items.models import ItemNoteTypes from ..local_fields.api import LocalField from ..utils import extracted_data_from_ref +from ...utils import language_mapping def enrich_document_data(sender, json=None, record=None, index=None, @@ -139,17 +141,24 @@ def enrich_document_data(sender, json=None, record=None, index=None, ) json['title'].append(title) - json['sort_title'] = title_format_text_head( + # sort title + sort_title = title_format_text_head( json.get('title', []), with_subtitle=True ) + language = language_mapping(json.get('language')[0].get('value')) + if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False): + sort_title = current_app.\ + extensions['reroils-normalizer-stop-words'].\ + normalize(sort_title, language) + json['sort_title'] = sort_title # Local fields in JSON local_fields = LocalField.get_local_fields_by_resource( 'doc', document_pid) if local_fields: json['local_fields'] = local_fields - # index both ISBN 10 and 13 format + # index both ISBN 10 and 13 format def filter_isbn(identified_by): """Filter identified_by for type bf:Isbn.""" return identified_by.get('type') == 'bf:Isbn' @@ -168,3 +177,17 @@ def filter_isbn(identified_by): isbns.add(to_isbn10(isbn)) if isbns: json['isbn'] = list(isbns) + + # Populate startDate and endDate for use in sorting + pub_provisions = [ + p for p in record.get('provisionActivity', []) + if p['type'] == 'bf:Publication' + ] + pub_provision = next(iter(pub_provisions), None) + if pub_provision: + if 'startDate' in pub_provision: + json['provision_activity_start_date'] = \ + pub_provision['startDate'] + if 'endDate' in pub_provision: + json['provision_activity_end_date'] = \ + pub_provision['endDate'] diff --git a/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json b/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json index 22d8d2a1f9..778e2670b8 100644 --- a/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json +++ b/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json @@ -1,25 +1,4 @@ { - "settings": { - "analysis": { - "filter": { - "autocomplete_filter": { - "type": "edge_ngram", - "min_gram": 1, - "max_gram": 20 - } - }, - "analyzer": { - "autocomplete": { - "type": "custom", - "tokenizer": "standard", - "filter": [ - "lowercase", - "autocomplete_filter" - ] - } - } - } - }, "mappings": { "date_detection": false, "numeric_detection": false, @@ -154,11 +133,11 @@ }, "autocomplete_title": { "type": "text", - "analyzer": "autocomplete", - "search_analyzer": "standard" + "analyzer": "autocomplete" }, "sort_title": { - "type": "keyword" + "type": "keyword", + "normalizer": "sort_normalizer" }, "responsibilityStatement": { "type": "object", @@ -368,6 +347,12 @@ } } }, + "provision_activity_start_date": { + "type": 
"integer" + }, + "provision_activity_end_date": { + "type": "integer" + }, "provisionActivity": { "type": "object", "properties": { diff --git a/rero_ils/modules/ext.py b/rero_ils/modules/ext.py index b5816554c3..9960c99c3a 100644 --- a/rero_ils/modules/ext.py +++ b/rero_ils/modules/ext.py @@ -47,6 +47,7 @@ from .items.listener import enrich_item_data from .loans.listener import enrich_loan_data, listener_loan_state_changed from .locations.listener import enrich_location_data +from .normalizer_stop_words import NormalizerStopWords from .notifications.listener import enrich_notification_data from .operation_logs.listener import operation_log_record_create, \ operation_log_record_delete, operation_log_record_update @@ -93,6 +94,7 @@ def init_app(self, app): """Flask application initialization.""" Bootstrap(app) Wiki(app) + NormalizerStopWords(app) self.init_config(app) app.extensions['rero-ils'] = self self.register_import_api_blueprint(app) diff --git a/rero_ils/modules/normalizer_stop_words.py b/rero_ils/modules/normalizer_stop_words.py new file mode 100644 index 0000000000..2cef40bef2 --- /dev/null +++ b/rero_ils/modules/normalizer_stop_words.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# +# RERO ILS +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Normalized sort for rero-ils.""" + +import re + + +class NormalizerStopWords(): + """Normalizer Stop words.""" + + stop_words_punctuation = [] + stop_words_regex = {} + + def __init__(self, app=None): + """Init.""" + self.app = app + if app is not None: + self.init_app(app) + + def init_app(self, app): + """Flask application initialization.""" + if app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False): + self.init_config(app) + app.extensions['reroils-normalizer-stop-words'] = self + + def init_config(self, app): + """Initialize configuration.""" + punc = app.config.get('RERO_ILS_STOP_WORDS_PUNCTUATION', []) + self.stop_words_punctuation = '|'.join(punc) + stop_words = app.config.get('RERO_ILS_STOP_WORDS', {}) + if stop_words != {}: + # Generating a regex per language + for lang in stop_words.keys(): + if lang in stop_words: + self.stop_words_regex[lang] = \ + r'\b(' + r'|'.join(stop_words[lang]) + r')\b\s*' + + def normalize(self, text, language=None): + """Normalize. 
+ + :param text: Text to be normalized + :param language: Language of the text + :returns: Normalized text + """ + word_regex = None + if language in self.stop_words_regex: + word_regex = self.stop_words_regex[language] + elif 'default' in self.stop_words_regex: + word_regex = self.stop_words_regex['default'] + if word_regex: + compiled = re.compile(fr'{word_regex}', re.IGNORECASE) + text = compiled.sub('', text) + if self.stop_words_punctuation: + punc_regex = self.stop_words_punctuation + compiled = re.compile( + fr'{punc_regex}', re.IGNORECASE) + text = compiled.sub('', text) + return re.sub(r'\s+', ' ', text).strip() diff --git a/rero_ils/utils.py b/rero_ils/utils.py index 4ff15d3ff8..f29381a39f 100644 --- a/rero_ils/utils.py +++ b/rero_ils/utils.py @@ -127,3 +127,15 @@ def language_iso639_2to1(lang): return default_ln supported_languages = [v[0] for v in current_i18n.get_languages()] return ln if ln in supported_languages else default_ln + + +def language_mapping(lang): + """Language mapping. + + :param lang: bibliographic language code + :returns: language mapping + """ + mapping = current_app.config.get('RERO_ILS_LANGUAGE_MAPPING', {}) + if lang in mapping: + return mapping[lang] + return lang diff --git a/tests/unit/test_normalizer_stop_words.py b/tests/unit/test_normalizer_stop_words.py new file mode 100644 index 0000000000..f75d205a67 --- /dev/null +++ b/tests/unit/test_normalizer_stop_words.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# +# RERO ILS +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Normalizer stop words tests.""" + +from rero_ils.modules.normalizer_stop_words import NormalizerStopWords + + +def test_normalize(app): + """Test stop words normalize.""" + # ---- The string is not analyzed + app.config['RERO_ILS_STOP_WORDS_ACTIVATE'] = False + normalizer = NormalizerStopWords(app) + text = "L'été a été très chaud." + assert text == normalizer.normalize(text) + + # ---- The string is analyzed + app.config['RERO_ILS_STOP_WORDS_ACTIVATE'] = True + app.config['RERO_ILS_STOP_WORDS_PUNCTUATION'] = [ + '"', ',', ';', ':', r'\.', '_', r'\?', r'\!', r'\*', r'\+', '\n' + ] + normalizer = NormalizerStopWords(app) + text = "L'été a été très chaud." + text_norm = "L'été a été très chaud" + # The language is not defined. Removal of punctuation only. + assert text_norm == normalizer.normalize(text) + + # Deleting words for the defined language. + text_norm = "été a été très chaud" + app.config['RERO_ILS_STOP_WORDS'] = { + 'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"] + } + assert text_norm == normalizer.normalize(text, 'fre') + + text = 'Journal des tribunaux : jurisprudence fédérale. ' \ + '4, Droit pénal et procédure pénale' + text_norm = 'Journal tribunaux jurisprudence fédérale ' \ + '4 Droit pénal et procédure pénale' + assert text_norm == normalizer.normalize(text, 'fre') + + # The language was not found in the definition of stop words. + text = "He plays this musical phrase quite well." 
+ text_norm = "He plays this musical phrase quite well" + assert text_norm == normalizer.normalize(text, 'eng') + + # Deleting words with the default definition. + text = "L'été a été très chaud." + text_norm = "été a été chaud" + app.config['RERO_ILS_STOP_WORDS'] = { + 'default': ["l'", "très"] + } + normalizer = NormalizerStopWords(app) + assert text_norm == normalizer.normalize(text, 'und') diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index e393a7028c..ab41beb7b6 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -24,7 +24,7 @@ from rero_ils.modules.patrons.api import Patron from rero_ils.modules.utils import add_years, extracted_data_from_ref, \ get_endpoint_configuration, get_schema_for_resource, read_json_record -from rero_ils.utils import get_current_language, language_iso639_2to1, \ +from rero_ils.utils import get_current_language, language_iso639_2to1, language_mapping, \ unique_list @@ -108,3 +108,10 @@ def test_language_iso639_2to1(app): assert language_iso639_2to1('ita') == 'it' # default language assert language_iso639_2to1('rus') == 'en' + + +def test_language_mapping(app): + """Test language mapping.""" + assert 'fre' == language_mapping('fre') + assert 'fre' == language_mapping('fra') + assert 'dut' == language_mapping('dum')