From 564e13fbcb6879dbb5337065cc5f108b480f8f3c Mon Sep 17 00:00:00 2001 From: Bertrand Zuchuat Date: Wed, 15 Sep 2021 15:35:15 +0200 Subject: [PATCH] data: implement stop words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Warning: reindexing documents, contributions and patrons is mandatory. * Adds a stop word based normalizer. * Adds fields on the documents for sorting. * Adds fields on the contributions for sorting. * Adds elasticsearch configuration for sorting on resources. * Adds language mapping for code conversion. * Fixes generating the document title on _text. * Closes #2396. Co-Authored-by: Bertrand Zuchuat Co-Authored-by: Johnny Mariéthoz --- rero_ils/config.py | 138 +++++++++++++++--- rero_ils/es_templates/v7/record.json | 51 +++++++ .../v7/contributions/contribution-v0.0.1.json | 27 +++- rero_ils/modules/documents/listener.py | 24 ++- .../v7/documents/document-v0.0.1.json | 33 ++--- rero_ils/modules/documents/utils.py | 4 + rero_ils/modules/ext.py | 2 + rero_ils/modules/normalizer_stop_words.py | 70 +++++++++ .../mappings/v7/patrons/patron-v0.0.1.json | 16 +- rero_ils/utils.py | 10 ++ tests/api/test_external_services.py | 2 + tests/unit/test_documents_utils.py | 77 ++++++++++ tests/unit/test_normalizer_stop_words.py | 67 +++++++++ tests/unit/test_utils.py | 8 +- 14 files changed, 475 insertions(+), 54 deletions(-) create mode 100644 rero_ils/modules/normalizer_stop_words.py create mode 100644 tests/unit/test_documents_utils.py create mode 100644 tests/unit/test_normalizer_stop_words.py diff --git a/rero_ils/config.py b/rero_ils/config.py index 3b83308dec..c78142d2bc 100644 --- a/rero_ils/config.py +++ b/rero_ils/config.py @@ -2205,6 +2205,54 @@ def _(x): RECORDS_REST_DEFAULT_SORT['circ_policies'] = dict( query='bestmatch', noquery='name') +# ------ COLLECTIONS SORT +RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict( + fields=['start_date', 'title_sort'], title='Start date and title', + default_order='asc' +) +RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict( + fields=['title_sort'], title='title', + default_order='asc' +) +RECORDS_REST_DEFAULT_SORT['collections'] = dict( + query='bestmatch', noquery='start_date') + +# ------ CONTRIBUTIONS SORT +RECORDS_REST_SORT_OPTIONS['contributions']['fr_name'] = dict( + fields=[ + 'idref_authorized_access_point_sort', + 'rero_authorized_access_point_sort', + 'gnd_authorized_access_point_sort', + ], + title='Collection french name', + default_order='asc' +) +RECORDS_REST_SORT_OPTIONS['contributions']['de_name'] = dict( + fields=[ + 'gnd_authorized_access_point_sort', + 'idref_authorized_access_point_sort', + 'rero_authorized_access_point_sort' + ], + title='Collection german name', + default_order='asc' +) + +# ------ DOCUMENTS SORT +RECORDS_REST_SORT_OPTIONS['documents']['title'] = dict( + fields=['sort_title'], title='Document title', + default_order='asc' +) + +RECORDS_REST_SORT_OPTIONS['documents']['pub_date_new'] = dict( + fields=['-sort_date_new'], title='Document date (newest)', + default_order='desc' +) + +RECORDS_REST_SORT_OPTIONS['documents']['pub_date_old'] = dict( + fields=['sort_date_old'], title='Document date (oldest)', + default_order='asc' +) + # ------ HOLDINGS SORT RECORDS_REST_SORT_OPTIONS['holdings']['library_location'] = dict( fields=['library.pid', 'location.pid'], @@ -2214,7 +2262,24 @@ def _(x): RECORDS_REST_DEFAULT_SORT['holdings'] = dict( query='bestmatch', noquery='library_location') -# ------ ITEM SORT +# ------ ITEMS SORT +RECORDS_REST_SORT_OPTIONS['items']['barcode'] = dict( + fields=['barcode'], title='Barcode', + default_order='asc' +) + +RECORDS_REST_SORT_OPTIONS['items']['call_number'] = dict( + fields=['call_number'], title='Call Number', + default_order='asc' +) +RECORDS_REST_SORT_OPTIONS['items']['second_call_number'] = dict( + fields=['second_call_number'], title='Second call Number', + default_order='asc' +) +RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict( + fields=['issue.expected_date'], title='Issue expected date', + default_order='asc' +) RECORDS_REST_SORT_OPTIONS['items']['enumeration_chronology'] = dict( fields=['-enumerationAndChronology'], title='Enumeration and Chronology', default_order='desc' @@ -2239,6 +2304,10 @@ def _(x): fields=['library_name'], title='Library name', default_order='asc' ) +RECORDS_REST_SORT_OPTIONS['libraries']['code'] = dict( + fields=['code'], title='Library code', + default_order='asc' +) RECORDS_REST_DEFAULT_SORT['libraries'] = dict( query='bestmatch', noquery='name') @@ -2282,19 +2351,6 @@ def _(x): RECORDS_REST_DEFAULT_SORT['patron_types'] = dict( query='bestmatch', noquery='name') -# ------ VENDORS SORT -RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict( - fields=['vendor_name'], title='Vendor name', - default_order='asc' -) -RECORDS_REST_DEFAULT_SORT['vendors'] = dict( - query='bestmatch', noquery='name') - -# ------ ITEMS SORT -RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict( - fields=['issue.expected_date'], title='Issue expected date', - default_order='asc' -) # ------ TEMPLATES SORT RECORDS_REST_SORT_OPTIONS['templates']['name'] = dict( fields=['name_sort'], title='Template name', @@ -2303,18 +2359,14 @@ def _(x): RECORDS_REST_DEFAULT_SORT['templates'] = dict( query='bestmatch', noquery='name') -# ------ COLLECTIONS SORT -RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict( - fields=['start_date', 'title_sort'], title='Start date and title', - default_order='asc' -) -RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict( - fields=['title_sort'], title='title', +# ------ VENDORS SORT +RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict( + fields=['vendor_name'], title='Vendor name', default_order='asc' ) +RECORDS_REST_DEFAULT_SORT['vendors'] = dict( + query='bestmatch', noquery='name') -RECORDS_REST_DEFAULT_SORT['collections'] = dict( - query='bestmatch', noquery='start_date') # Detailed View Configuration # =========================== @@ -2805,3 +2857,43 @@ def _(x): # OAuth base template OAUTH2SERVER_COVER_TEMPLATE = 'rero_ils/oauth/base.html' + +# STOP WORDS +# Disregarded articles for sorting processes +# ========== +# ACTIVATE STOP WORDS NORMALIZATION +RERO_ILS_STOP_WORDS_ACTIVATE = True +# PUNCTUATION +RERO_ILS_STOP_WORDS_PUNCTUATION = [ + r'\[', r'\]', '"', ',', ';', ':', r'\.', '_', + r'\?', r'\!', r'\*', r'\+', '\n' +] +# STOP WORDS BY LANGUAGE +# Possibility to add a default configuration with a "default" entry. +# This default configuration will be used if the language is not present +RERO_ILS_STOP_WORDS = { + 'dan': ["de", "den", "det", "en", "et"], + 'dut': [ + "d'", "de", "den", "der", "des", "het", "'s", "'t", "een", + "eene", "eener", "eens", "ene", "'n"], + 'eng': ["a", "an", "the"], + 'epo': ["la", "l'", "unu"], + 'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"], + 'ger': [ + "das", "dem", "den", "der", "des", "die", + "ein", "eine", "einem", "einen", "einer", "eines"], + 'hun': [ "a", "az", "egy"], + 'ita': [ + "gli", "i", "il", "l'", "la", "le", "li", "lo", + "un", "un'", "una", "uno"], + 'nor': ["de", "dei", "den", "det", "ei", "en", "et"], + 'por': ["a", "as", "o", "os", "um", "uma", "umas", "uns"], + 'spa': ["el", "la", "las", "lo", "los", "un", "una", "unas", "unos"], + 'swe': ["de", "den", "det", "en", "ett"] +} + +# LANGUAGE MAPPING +# ================ +RERO_ILS_LANGUAGE_MAPPING = { + 'dum': 'dut' # neerlandais +} diff --git a/rero_ils/es_templates/v7/record.json b/rero_ils/es_templates/v7/record.json index ae0cd26838..ba0a909ac8 100644 --- a/rero_ils/es_templates/v7/record.json +++ b/rero_ils/es_templates/v7/record.json @@ -6,6 +6,11 @@ "max_result_window": "100000", "analysis": { "filter": { + "edge_ngram_filter": { + "type": "edge_ngram", + "min_gram": 3, + "max_gram": 10 + }, "french_elision": { "type": "elision", "articles_case": true, @@ -25,6 +30,32 @@ "puisqu" ] }, + "italian_elision": { + "type": "elision", + "articles": [ + "c", "l", "all", "dall", "dell", + "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", + "sugl", "un", "m", "t", "s", "v", "d" + ], + "articles_case": true + }, + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "french_stop": { + "type": "stop", + "stopwords": "_french_" + }, + "german_stop": { + "type": "stop", + "stopwords": "_german_" + }, + "italian_stop": { + "type": "stop", + "stopwords": "_italian_" + }, "french_stemmer": { "type": "stemmer", "language": "light_french" @@ -65,6 +96,26 @@ "icu_folding", "german_normalization" ] + }, + "autocomplete": { + "type": "custom", + "tokenizer": "standard", + "filter": [ + "lowercase", + "icu_normalizer", + "icu_folding", + "french_elision", + "italian_elision", + "edge_ngram_filter" + ] + } + }, + "normalizer": { + "sort_normalizer": { + "type": "custom", + "filter": [ + "lowercase" + ] } } } diff --git a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json index ac1bf5afc1..3bc1b25499 100644 --- a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json +++ b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json @@ -29,6 +29,18 @@ "analyzer": "autocomplete", "search_analyzer": "standard" }, + "gnd_authorized_access_point_sort": { + "type": "keyword", + "normalizer": "sort_normalizer" + }, + "idref_authorized_access_point_sort": { + "type": "keyword", + "normalizer": "sort_normalizer" + }, + "rero_authorized_access_point_sort": { + "type": "keyword", + "normalizer": "sort_normalizer" + }, "$schema": { "type": "keyword" }, @@ -100,7 +112,10 @@ }, "authorized_access_point": { "type": "text", - "copy_to": "autocomplete_name" + "copy_to": [ + "autocomplete_name", + "gnd_authorized_access_point_sort" + ] }, "qualifier": { "type": "keyword" @@ -203,7 +218,10 @@ }, "authorized_access_point": { "type": "text", - "copy_to": "autocomplete_name" + "copy_to": [ + "autocomplete_name", + "idref_authorized_access_point_sort" + ] }, "qualifier": { "type": "keyword" @@ -306,7 +324,10 @@ }, "authorized_access_point": { "type": "text", - "copy_to": "autocomplete_name" + "copy_to": [ + "autocomplete_name", + "rero_authorized_access_point_sort" + ] }, "qualifier": { "type": "keyword" diff --git a/rero_ils/modules/documents/listener.py b/rero_ils/modules/documents/listener.py index 0c78e6cd71..83d5d63d6d 100644 --- a/rero_ils/modules/documents/listener.py +++ b/rero_ils/modules/documents/listener.py @@ -17,6 +17,7 @@ """Signals connector for Document.""" +from flask.globals import current_app from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13 from .utils import create_contributions, title_format_text_head @@ -26,6 +27,7 @@ from ..items.models import ItemNoteTypes from ..local_fields.api import LocalField from ..utils import extracted_data_from_ref +from ...utils import language_mapping def enrich_document_data(sender, json=None, record=None, index=None, @@ -139,17 +141,24 @@ def enrich_document_data(sender, json=None, record=None, index=None, ) json['title'].append(title) - json['sort_title'] = title_format_text_head( + # sort title + sort_title = title_format_text_head( json.get('title', []), with_subtitle=True ) + language = language_mapping(json.get('language')[0].get('value')) + if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False): + sort_title = current_app.\ + extensions['reroils-normalizer-stop-words'].\ + normalize(sort_title, language) + json['sort_title'] = sort_title # Local fields in JSON local_fields = LocalField.get_local_fields_by_resource( 'doc', document_pid) if local_fields: json['local_fields'] = local_fields - # index both ISBN 10 and 13 format + # index both ISBN 10 and 13 format def filter_isbn(identified_by): """Filter identified_by for type bf:Isbn.""" return identified_by.get('type') == 'bf:Isbn' @@ -168,3 +177,14 @@ def filter_isbn(identified_by): isbns.add(to_isbn10(isbn)) if isbns: json['isbn'] = list(isbns) + + # Populate sort date new and old for use in sorting + pub_provisions = [ + p for p in record.get('provisionActivity', []) + if p['type'] == 'bf:Publication' + ] + pub_provision = next(iter(pub_provisions), None) + if pub_provision: + json['sort_date_new'] = \ + pub_provision.get('endDate', pub_provision.get('startDate')) + json['sort_date_old'] = pub_provision.get('startDate') diff --git a/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json b/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json index 22d8d2a1f9..cc2c6b99b5 100644 --- a/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json +++ b/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json @@ -1,25 +1,4 @@ { - "settings": { - "analysis": { - "filter": { - "autocomplete_filter": { - "type": "edge_ngram", - "min_gram": 1, - "max_gram": 20 - } - }, - "analyzer": { - "autocomplete": { - "type": "custom", - "tokenizer": "standard", - "filter": [ - "lowercase", - "autocomplete_filter" - ] - } - } - } - }, "mappings": { "date_detection": false, "numeric_detection": false, @@ -154,11 +133,11 @@ }, "autocomplete_title": { "type": "text", - "analyzer": "autocomplete", - "search_analyzer": "standard" + "analyzer": "autocomplete" }, "sort_title": { - "type": "keyword" + "type": "keyword", + "normalizer": "sort_normalizer" }, "responsibilityStatement": { "type": "object", @@ -368,6 +347,12 @@ } } }, + "sort_date_new": { + "type": "integer" + }, + "sort_date_old": { + "type": "integer" + }, "provisionActivity": { "type": "object", "properties": { diff --git a/rero_ils/modules/documents/utils.py b/rero_ils/modules/documents/utils.py index ae68270ab6..866a263db9 100644 --- a/rero_ils/modules/documents/utils.py +++ b/rero_ils/modules/documents/utils.py @@ -266,6 +266,10 @@ def title_format_text_head(titles, responsabilities=None, with_subtitle=True): language = title_text.get('language') if display_alternate_graphic_first(language): head_titles.append(title_text.get('value')) + # If I don't have a title available, + # I get the last value of the table + if len(head_titles) == 0: + head_titles.append(title_texts[-1].get('value')) elif title.get('type') == 'bf:ParallelTitle': parallel_title_texts = title_format_text( title=title, with_subtitle=with_subtitle) diff --git a/rero_ils/modules/ext.py b/rero_ils/modules/ext.py index 93dc58c0ba..756d01185f 100644 --- a/rero_ils/modules/ext.py +++ b/rero_ils/modules/ext.py @@ -47,6 +47,7 @@ from .items.listener import enrich_item_data from .loans.listener import enrich_loan_data, listener_loan_state_changed from .locations.listener import enrich_location_data +from .normalizer_stop_words import NormalizerStopWords from .notifications.listener import enrich_notification_data from .patron_transaction_events.listener import \ enrich_patron_transaction_event_data @@ -95,6 +96,7 @@ def init_app(self, app): """Flask application initialization.""" Bootstrap(app) Wiki(app) + NormalizerStopWords(app) self.init_config(app) app.extensions['rero-ils'] = self self.register_import_api_blueprint(app) diff --git a/rero_ils/modules/normalizer_stop_words.py b/rero_ils/modules/normalizer_stop_words.py new file mode 100644 index 0000000000..7070587414 --- /dev/null +++ b/rero_ils/modules/normalizer_stop_words.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# +# RERO ILS +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Normalized sort for rero-ils.""" + +import re + + +class NormalizerStopWords(): + """Normalizer Stop words.""" + + stop_words_punctuation = [] + stop_words_regex = {} + + def __init__(self, app=None): + """Init.""" + self.app = app + if app is not None: + self.init_app(app) + + def init_app(self, app): + """Flask application initialization.""" + if app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False): + self.init_config(app) + app.extensions['reroils-normalizer-stop-words'] = self + + def init_config(self, app): + """Initialize configuration.""" + punc = app.config.get('RERO_ILS_STOP_WORDS_PUNCTUATION', []) + self.stop_words_punctuation = '|'.join(punc) + stop_words = app.config.get('RERO_ILS_STOP_WORDS', {}) + if stop_words: + # Generating a regex per language + for lang, words in stop_words.items(): + self.stop_words_regex[lang] = \ + r'\b(' + r'|'.join(words) + r')\b\s*' + + def normalize(self, text, language=None): + """Normalize. + + :param text: Text to be normalized + :param language: Language of the text + :returns: Normalized text + """ + word_regex = self.stop_words_regex.get( + language, + self.stop_words_regex.get('default') + ) + if word_regex: + compiled = re.compile(fr'{word_regex}', re.IGNORECASE) + text = compiled.sub('', text) + if self.stop_words_punctuation: + compiled = re.compile( + fr'{self.stop_words_punctuation}', re.IGNORECASE) + text = compiled.sub('', text) + return re.sub(r'\s+', ' ', text).strip() diff --git a/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json b/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json index 4b808bcf0e..baa468247c 100644 --- a/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json +++ b/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json @@ -1,4 +1,16 @@ { + "settings": { + "analysis": { + "normalizer": { + "name_normalizer": { + "type": "custom", + "filter": [ + "lowercase" + ] + } + } + } + }, "mappings": { "date_detection": false, "numeric_detection": false, @@ -14,6 +26,7 @@ "copy_to": "first_name_sort" }, "first_name_sort": { + "normalizer": "name_normalizer", "type": "keyword" }, "last_name": { @@ -21,6 +34,7 @@ "copy_to": "last_name_sort" }, "last_name_sort": { + "normalizer": "name_normalizer", "type": "keyword" }, "gender": { @@ -228,4 +242,4 @@ } } } -} +} \ No newline at end of file diff --git a/rero_ils/utils.py b/rero_ils/utils.py index 4ff15d3ff8..b2b147bfdf 100644 --- a/rero_ils/utils.py +++ b/rero_ils/utils.py @@ -127,3 +127,13 @@ def language_iso639_2to1(lang): return default_ln supported_languages = [v[0] for v in current_i18n.get_languages()] return ln if ln in supported_languages else default_ln + + +def language_mapping(lang): + """Language mapping. + + :param lang: bibliographic language code + :returns: language mapping + """ + return current_app.config.get('RERO_ILS_LANGUAGE_MAPPING', {})\ + .get(lang, lang) diff --git a/tests/api/test_external_services.py b/tests/api/test_external_services.py index be1227eea5..caf43e6654 100644 --- a/tests/api/test_external_services.py +++ b/tests/api/test_external_services.py @@ -45,6 +45,8 @@ def clean_authorized_access_point(data): contribution['agent'] = agent contributions.append(contribution) + data.pop('sort_date_new', None) + data.pop('sort_date_old', None) data.pop('sort_title', None) data.pop('isbn', None) return data diff --git a/tests/unit/test_documents_utils.py b/tests/unit/test_documents_utils.py new file mode 100644 index 0000000000..7d0e5680b9 --- /dev/null +++ b/tests/unit/test_documents_utils.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +# +# RERO ILS +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Document utils tests.""" + +from __future__ import absolute_import, print_function + +from rero_ils.modules.documents.utils import title_format_text_head + + +def test_title_format_text_head(): + """Test title format text head.""" + data = [{ + "mainTitle": [ + { + "value": "Dingding lixianji" + }, + { + "value": "\u4e01\u4e01\u5386\u9669\u8bb0", + "language": "und-hani" + } + ], + "type": "bf:Title" + }] + assert "\u4e01\u4e01\u5386\u9669\u8bb0" == title_format_text_head(data) + + data = [{ + "mainTitle": [ + { + "value": "Die russischen orthodoxen Bischöfe von 1893", + } + ], + "subtitle": [ + { + "value": "Bio-Bibliographie" + } + ], + "type": "bf:Title" + } + ] + assert "Die russischen orthodoxen Bischöfe von 1893 " \ + ": Bio-Bibliographie" == title_format_text_head(data) + + data = [{ + "mainTitle": [ + { + "value": "Die russischen orthodoxen Bischöfe von 1893", + }, + { + "value": "The Russian Orthodox Bishops of 1893", + "language": "eng" + } + ], + "subtitle": [ + { + "value": "Bio-Bibliographie" + } + ], + "type": "bf:Title" + } + ] + assert "The Russian Orthodox Bishops of 1893" == \ + title_format_text_head(data) diff --git a/tests/unit/test_normalizer_stop_words.py b/tests/unit/test_normalizer_stop_words.py new file mode 100644 index 0000000000..f75d205a67 --- /dev/null +++ b/tests/unit/test_normalizer_stop_words.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# +# RERO ILS +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +"""Normalizer stop words tests.""" + +from rero_ils.modules.normalizer_stop_words import NormalizerStopWords + + +def test_normalize(app): + """Test stop words normalize.""" + # ---- The string is not analyzed + app.config['RERO_ILS_STOP_WORDS_ACTIVATE'] = False + normalizer = NormalizerStopWords(app) + text = "L'été a été très chaud." + assert text == normalizer.normalize(text) + + # ---- The string is analyzed + app.config['RERO_ILS_STOP_WORDS_ACTIVATE'] = True + app.config['RERO_ILS_STOP_WORDS_PUNCTUATION'] = [ + '"', ',', ';', ':', r'\.', '_', r'\?', r'\!', r'\*', r'\+', '\n' + ] + normalizer = NormalizerStopWords(app) + text = "L'été a été très chaud." + text_norm = "L'été a été très chaud" + # The language is not defined. Removal of punctuation only. + assert text_norm == normalizer.normalize(text) + + # Deleting words for the defined language. + text_norm = "été a été très chaud" + app.config['RERO_ILS_STOP_WORDS'] = { + 'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"] + } + assert text_norm == normalizer.normalize(text, 'fre') + + text = 'Journal des tribunaux : jurisprudence fédérale. ' \ + '4, Droit pénal et procédure pénale' + text_norm = 'Journal tribunaux jurisprudence fédérale ' \ + '4 Droit pénal et procédure pénale' + assert text_norm == normalizer.normalize(text, 'fre') + + # The language was not found in the definition of stop words. + text = "He plays this musical phrase quite well." + text_norm = "He plays this musical phrase quite well" + assert text_norm == normalizer.normalize(text, 'eng') + + # Deleting words with the default definition. + text = "L'été a été très chaud." + text_norm = "été a été chaud" + app.config['RERO_ILS_STOP_WORDS'] = { + 'default': ["l'", "très"] + } + normalizer = NormalizerStopWords(app) + assert text_norm == normalizer.normalize(text, 'und') diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 82d4c335ad..21362edfe0 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -25,7 +25,7 @@ from rero_ils.modules.utils import add_years, extracted_data_from_ref, \ get_endpoint_configuration, get_schema_for_resource, read_json_record from rero_ils.utils import get_current_language, language_iso639_2to1, \ - unique_list + language_mapping, unique_list def test_unique_list(): @@ -110,3 +110,9 @@ def test_language_iso639_2to1(app): assert language_iso639_2to1('ita') == 'it' # default language assert language_iso639_2to1('rus') == 'en' + + +def test_language_mapping(app): + """Test language mapping.""" + assert 'fre' == language_mapping('fre') + assert 'dut' == language_mapping('dum')