From 564e13fbcb6879dbb5337065cc5f108b480f8f3c Mon Sep 17 00:00:00 2001
From: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
Date: Wed, 15 Sep 2021 15:35:15 +0200
Subject: [PATCH] data: implement stop words
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Warning: reindexing documents, contributions and patrons is mandatory.

* Adds a stop word based normalizer.
* Adds fields on the documents for sorting.
* Adds fields on the contributions for sorting.
* Adds elasticsearch configuration for sorting on resources.
* Adds language mapping for code conversion.
* Fixes generating the document title on _text.
* Closes #2396.

Co-Authored-by: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
Co-Authored-by: Johnny Mariéthoz <johnny.mariethoz@rero.ch>
---
 rero_ils/config.py                            | 138 +++++++++++++++---
 rero_ils/es_templates/v7/record.json          |  51 +++++++
 .../v7/contributions/contribution-v0.0.1.json |  27 +++-
 rero_ils/modules/documents/listener.py        |  24 ++-
 .../v7/documents/document-v0.0.1.json         |  33 ++---
 rero_ils/modules/documents/utils.py           |   4 +
 rero_ils/modules/ext.py                       |   2 +
 rero_ils/modules/normalizer_stop_words.py     |  70 +++++++++
 .../mappings/v7/patrons/patron-v0.0.1.json    |  16 +-
 rero_ils/utils.py                             |  10 ++
 tests/api/test_external_services.py           |   2 +
 tests/unit/test_documents_utils.py            |  77 ++++++++++
 tests/unit/test_normalizer_stop_words.py      |  67 +++++++++
 tests/unit/test_utils.py                      |   8 +-
 14 files changed, 475 insertions(+), 54 deletions(-)
 create mode 100644 rero_ils/modules/normalizer_stop_words.py
 create mode 100644 tests/unit/test_documents_utils.py
 create mode 100644 tests/unit/test_normalizer_stop_words.py

diff --git a/rero_ils/config.py b/rero_ils/config.py
index 3b83308dec..c78142d2bc 100644
--- a/rero_ils/config.py
+++ b/rero_ils/config.py
@@ -2205,6 +2205,54 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['circ_policies'] = dict(
     query='bestmatch', noquery='name')
 
+# ------ COLLECTIONS SORT
+RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
+    fields=['start_date', 'title_sort'], title='Start date and title',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
+    fields=['title_sort'], title='title',
+    default_order='asc'
+)
+RECORDS_REST_DEFAULT_SORT['collections'] = dict(
+    query='bestmatch', noquery='start_date')
+
+# ------ CONTRIBUTIONS SORT
+RECORDS_REST_SORT_OPTIONS['contributions']['fr_name'] = dict(
+    fields=[
+        'idref_authorized_access_point_sort',
+        'rero_authorized_access_point_sort',
+        'gnd_authorized_access_point_sort',
+    ],
+    title='Collection french name',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['contributions']['de_name'] = dict(
+    fields=[
+        'gnd_authorized_access_point_sort',
+        'idref_authorized_access_point_sort',
+        'rero_authorized_access_point_sort'
+    ],
+    title='Collection german name',
+    default_order='asc'
+)
+
+# ------ DOCUMENTS SORT
+RECORDS_REST_SORT_OPTIONS['documents']['title'] = dict(
+    fields=['sort_title'], title='Document title',
+    default_order='asc'
+)
+
+RECORDS_REST_SORT_OPTIONS['documents']['pub_date_new'] = dict(
+    fields=['-sort_date_new'], title='Document date (newest)',
+    default_order='desc'
+)
+
+RECORDS_REST_SORT_OPTIONS['documents']['pub_date_old'] = dict(
+    fields=['sort_date_old'], title='Document date (oldest)',
+    default_order='asc'
+)
+
 # ------ HOLDINGS SORT
 RECORDS_REST_SORT_OPTIONS['holdings']['library_location'] = dict(
     fields=['library.pid', 'location.pid'],
@@ -2214,7 +2262,24 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['holdings'] = dict(
     query='bestmatch', noquery='library_location')
 
-# ------ ITEM SORT
+# ------ ITEMS SORT
+RECORDS_REST_SORT_OPTIONS['items']['barcode'] = dict(
+    fields=['barcode'], title='Barcode',
+    default_order='asc'
+)
+
+RECORDS_REST_SORT_OPTIONS['items']['call_number'] = dict(
+    fields=['call_number'], title='Call Number',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['items']['second_call_number'] = dict(
+    fields=['second_call_number'], title='Second call Number',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
+    fields=['issue.expected_date'], title='Issue expected date',
+    default_order='asc'
+)
 RECORDS_REST_SORT_OPTIONS['items']['enumeration_chronology'] = dict(
     fields=['-enumerationAndChronology'], title='Enumeration and Chronology',
     default_order='desc'
@@ -2239,6 +2304,10 @@ def _(x):
     fields=['library_name'], title='Library name',
     default_order='asc'
 )
+RECORDS_REST_SORT_OPTIONS['libraries']['code'] = dict(
+    fields=['code'], title='Library code',
+    default_order='asc'
+)
 RECORDS_REST_DEFAULT_SORT['libraries'] = dict(
     query='bestmatch', noquery='name')
 
@@ -2282,19 +2351,6 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['patron_types'] = dict(
     query='bestmatch', noquery='name')
 
-# ------ VENDORS SORT
-RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
-    fields=['vendor_name'], title='Vendor name',
-    default_order='asc'
-)
-RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
-    query='bestmatch', noquery='name')
-
-# ------ ITEMS SORT
-RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
-    fields=['issue.expected_date'], title='Issue expected date',
-    default_order='asc'
-)
 # ------ TEMPLATES SORT
 RECORDS_REST_SORT_OPTIONS['templates']['name'] = dict(
     fields=['name_sort'], title='Template name',
@@ -2303,18 +2359,14 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['templates'] = dict(
     query='bestmatch', noquery='name')
 
-# ------ COLLECTIONS SORT
-RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
-    fields=['start_date', 'title_sort'], title='Start date and title',
-    default_order='asc'
-)
-RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
-    fields=['title_sort'], title='title',
+# ------ VENDORS SORT
+RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
+    fields=['vendor_name'], title='Vendor name',
     default_order='asc'
 )
+RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
+    query='bestmatch', noquery='name')
 
-RECORDS_REST_DEFAULT_SORT['collections'] = dict(
-    query='bestmatch', noquery='start_date')
 
 # Detailed View Configuration
 # ===========================
@@ -2805,3 +2857,43 @@ def _(x):
 
 # OAuth base template
 OAUTH2SERVER_COVER_TEMPLATE = 'rero_ils/oauth/base.html'
+
+# STOP WORDS
+# Articles to be disregarded when sorting
+# ==========
+# ACTIVATE STOP WORDS NORMALIZATION
+RERO_ILS_STOP_WORDS_ACTIVATE = True
+# PUNCTUATION
+RERO_ILS_STOP_WORDS_PUNCTUATION = [
+    r'\[', r'\]', '"', ',', ';', ':', r'\.', '_',
+    r'\?', r'\!', r'\*', r'\+', '\n'
+]
+# STOP WORDS BY LANGUAGE
+# An optional "default" entry can be added to this mapping.
+# It is used as a fallback when the language has no entry of its own.
+RERO_ILS_STOP_WORDS = {
+    'dan': ["de", "den", "det", "en", "et"],
+    'dut': [
+        "d'", "de", "den", "der", "des", "het", "'s", "'t", "een",
+        "eene", "eener", "eens", "ene", "'n"],
+    'eng': ["a", "an", "the"],
+    'epo': ["la", "l'", "unu"],
+    'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"],
+    'ger': [
+        "das", "dem", "den", "der", "des", "die",
+        "ein", "eine", "einem", "einen", "einer", "eines"],
+    'hun': [ "a", "az", "egy"],
+    'ita': [
+        "gli", "i", "il", "l'", "la", "le", "li", "lo",
+        "un", "un'", "una", "uno"],
+    'nor': ["de", "dei", "den", "det", "ei", "en", "et"],
+    'por': ["a", "as", "o", "os", "um", "uma", "umas", "uns"],
+    'spa': ["el", "la", "las", "lo", "los", "un", "una", "unas", "unos"],
+    'swe': ["de", "den", "det", "en", "ett"]
+}
+
+# LANGUAGE MAPPING
+# ================
+RERO_ILS_LANGUAGE_MAPPING = {
+    'dum': 'dut'  # Middle Dutch is mapped to Dutch
+}
diff --git a/rero_ils/es_templates/v7/record.json b/rero_ils/es_templates/v7/record.json
index ae0cd26838..ba0a909ac8 100644
--- a/rero_ils/es_templates/v7/record.json
+++ b/rero_ils/es_templates/v7/record.json
@@ -6,6 +6,11 @@
     "max_result_window": "100000",
     "analysis": {
       "filter": {
+        "edge_ngram_filter": {
+          "type": "edge_ngram",
+          "min_gram": 3,
+          "max_gram": 10
+        },
         "french_elision": {
           "type": "elision",
           "articles_case": true,
@@ -25,6 +30,32 @@
             "puisqu"
           ]
         },
+        "italian_elision": {
+          "type": "elision",
+          "articles": [
+              "c", "l", "all", "dall", "dell",
+              "nell", "sull", "coll", "pell",
+              "gl", "agl", "dagl", "degl", "negl",
+              "sugl", "un", "m", "t", "s", "v", "d"
+          ],
+          "articles_case": true
+        },
+        "english_stop": {
+          "type":       "stop",
+          "stopwords":  "_english_"
+        },
+        "french_stop": {
+          "type":       "stop",
+          "stopwords":  "_french_"
+        },
+        "german_stop": {
+          "type":       "stop",
+          "stopwords":  "_german_"
+        },
+        "italian_stop": {
+          "type":       "stop",
+          "stopwords":  "_italian_"
+        },
         "french_stemmer": {
           "type": "stemmer",
           "language": "light_french"
@@ -65,6 +96,26 @@
             "icu_folding",
             "german_normalization"
           ]
+        },
+        "autocomplete": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "icu_normalizer",
+            "icu_folding",
+            "french_elision",
+            "italian_elision",
+            "edge_ngram_filter"
+          ]
+        }
+      },
+      "normalizer": {
+        "sort_normalizer": {
+          "type": "custom",
+          "filter": [
+            "lowercase"
+          ]
         }
       }
     }
diff --git a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json
index ac1bf5afc1..3bc1b25499 100644
--- a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json
+++ b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json
@@ -29,6 +29,18 @@
         "analyzer": "autocomplete",
         "search_analyzer": "standard"
       },
+      "gnd_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
+      "idref_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
+      "rero_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
       "$schema": {
         "type": "keyword"
       },
@@ -100,7 +112,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "gnd_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"
@@ -203,7 +218,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "idref_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"
@@ -306,7 +324,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "rero_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"
diff --git a/rero_ils/modules/documents/listener.py b/rero_ils/modules/documents/listener.py
index 0c78e6cd71..83d5d63d6d 100644
--- a/rero_ils/modules/documents/listener.py
+++ b/rero_ils/modules/documents/listener.py
@@ -17,6 +17,7 @@
 
 """Signals connector for Document."""
 
+from flask.globals import current_app
 from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13
 
 from .utils import create_contributions, title_format_text_head
@@ -26,6 +27,7 @@
 from ..items.models import ItemNoteTypes
 from ..local_fields.api import LocalField
 from ..utils import extracted_data_from_ref
+from ...utils import language_mapping
 
 
 def enrich_document_data(sender, json=None, record=None, index=None,
@@ -139,17 +141,24 @@ def enrich_document_data(sender, json=None, record=None, index=None,
                         )
             json['title'].append(title)
 
-        json['sort_title'] = title_format_text_head(
+        # sort title
+        sort_title = title_format_text_head(
             json.get('title', []),
             with_subtitle=True
         )
+        language = language_mapping(json.get('language')[0].get('value'))
+        if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
+            sort_title = current_app.\
+                extensions['reroils-normalizer-stop-words'].\
+                normalize(sort_title, language)
+        json['sort_title'] = sort_title
         # Local fields in JSON
         local_fields = LocalField.get_local_fields_by_resource(
             'doc', document_pid)
         if local_fields:
             json['local_fields'] = local_fields
-        # index both ISBN 10 and 13 format
 
+        # index both ISBN 10 and 13 format
         def filter_isbn(identified_by):
             """Filter identified_by for type bf:Isbn."""
             return identified_by.get('type') == 'bf:Isbn'
@@ -168,3 +177,14 @@ def filter_isbn(identified_by):
                 isbns.add(to_isbn10(isbn))
         if isbns:
             json['isbn'] = list(isbns)
+
+        # Populate sort date new and old for use in sorting
+        pub_provisions = [
+            p for p in record.get('provisionActivity', [])
+            if p['type'] == 'bf:Publication'
+        ]
+        pub_provision = next(iter(pub_provisions), None)
+        if pub_provision:
+            json['sort_date_new'] = \
+                pub_provision.get('endDate', pub_provision.get('startDate'))
+            json['sort_date_old'] = pub_provision.get('startDate')
diff --git a/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json b/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json
index 22d8d2a1f9..cc2c6b99b5 100644
--- a/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json
+++ b/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json
@@ -1,25 +1,4 @@
 {
-  "settings": {
-    "analysis": {
-      "filter": {
-        "autocomplete_filter": {
-          "type": "edge_ngram",
-          "min_gram": 1,
-          "max_gram": 20
-        }
-      },
-      "analyzer": {
-        "autocomplete": {
-          "type": "custom",
-          "tokenizer": "standard",
-          "filter": [
-            "lowercase",
-            "autocomplete_filter"
-          ]
-        }
-      }
-    }
-  },
   "mappings": {
     "date_detection": false,
     "numeric_detection": false,
@@ -154,11 +133,11 @@
       },
       "autocomplete_title": {
         "type": "text",
-        "analyzer": "autocomplete",
-        "search_analyzer": "standard"
+        "analyzer": "autocomplete"
       },
       "sort_title": {
-        "type": "keyword"
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
       },
       "responsibilityStatement": {
         "type": "object",
@@ -368,6 +347,12 @@
           }
         }
       },
+      "sort_date_new": {
+        "type": "integer"
+      },
+      "sort_date_old": {
+        "type": "integer"
+      },
       "provisionActivity": {
         "type": "object",
         "properties": {
diff --git a/rero_ils/modules/documents/utils.py b/rero_ils/modules/documents/utils.py
index ae68270ab6..866a263db9 100644
--- a/rero_ils/modules/documents/utils.py
+++ b/rero_ils/modules/documents/utils.py
@@ -266,6 +266,10 @@ def title_format_text_head(titles, responsabilities=None, with_subtitle=True):
                     language = title_text.get('language')
                     if display_alternate_graphic_first(language):
                         head_titles.append(title_text.get('value'))
+                # If no title has been selected so far,
+                # fall back to the value of the last title text
+                if len(head_titles) == 0:
+                    head_titles.append(title_texts[-1].get('value'))
         elif title.get('type') == 'bf:ParallelTitle':
             parallel_title_texts = title_format_text(
                 title=title, with_subtitle=with_subtitle)
diff --git a/rero_ils/modules/ext.py b/rero_ils/modules/ext.py
index 93dc58c0ba..756d01185f 100644
--- a/rero_ils/modules/ext.py
+++ b/rero_ils/modules/ext.py
@@ -47,6 +47,7 @@
 from .items.listener import enrich_item_data
 from .loans.listener import enrich_loan_data, listener_loan_state_changed
 from .locations.listener import enrich_location_data
+from .normalizer_stop_words import NormalizerStopWords
 from .notifications.listener import enrich_notification_data
 from .patron_transaction_events.listener import \
     enrich_patron_transaction_event_data
@@ -95,6 +96,7 @@ def init_app(self, app):
         """Flask application initialization."""
         Bootstrap(app)
         Wiki(app)
+        NormalizerStopWords(app)
         self.init_config(app)
         app.extensions['rero-ils'] = self
         self.register_import_api_blueprint(app)
diff --git a/rero_ils/modules/normalizer_stop_words.py b/rero_ils/modules/normalizer_stop_words.py
new file mode 100644
index 0000000000..7070587414
--- /dev/null
+++ b/rero_ils/modules/normalizer_stop_words.py
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+#
+# RERO ILS
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Normalized sort for rero-ils."""
+
+import re
+
+
+class NormalizerStopWords():
+    """Normalizer Stop words."""
+
+    stop_words_punctuation = []
+    stop_words_regex = {}
+
+    def __init__(self, app=None):
+        """Init."""
+        self.app = app
+        if app is not None:
+            self.init_app(app)
+
+    def init_app(self, app):
+        """Flask application initialization."""
+        if app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
+            self.init_config(app)
+            app.extensions['reroils-normalizer-stop-words'] = self
+
+    def init_config(self, app):
+        """Initialize configuration."""
+        punc = app.config.get('RERO_ILS_STOP_WORDS_PUNCTUATION', [])
+        self.stop_words_punctuation = '|'.join(punc)
+        stop_words = app.config.get('RERO_ILS_STOP_WORDS', {})
+        if stop_words:
+            # Generating a regex per language
+            for lang, words in stop_words.items():
+                self.stop_words_regex[lang] = \
+                     r'\b(' + r'|'.join(words) + r')\b\s*'
+
+    def normalize(self, text, language=None):
+        """Normalize.
+
+        :param text: Text to be normalized
+        :param language: Language of the text
+        :returns: Normalized text
+        """
+        word_regex = self.stop_words_regex.get(
+            language,
+            self.stop_words_regex.get('default')
+        )
+        if word_regex:
+            compiled = re.compile(fr'{word_regex}', re.IGNORECASE)
+            text = compiled.sub('', text)
+        if self.stop_words_punctuation:
+            compiled = re.compile(
+                fr'{self.stop_words_punctuation}', re.IGNORECASE)
+            text = compiled.sub('', text)
+        return re.sub(r'\s+', ' ', text).strip()
diff --git a/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json b/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json
index 4b808bcf0e..baa468247c 100644
--- a/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json
+++ b/rero_ils/modules/patrons/mappings/v7/patrons/patron-v0.0.1.json
@@ -1,4 +1,16 @@
 {
+  "settings": {
+    "analysis": {
+      "normalizer": {
+        "name_normalizer": {
+          "type": "custom",
+          "filter": [
+            "lowercase"
+          ]
+        }
+      }
+    }
+  },
   "mappings": {
     "date_detection": false,
     "numeric_detection": false,
@@ -14,6 +26,7 @@
         "copy_to": "first_name_sort"
       },
       "first_name_sort": {
+        "normalizer": "name_normalizer",
         "type": "keyword"
       },
       "last_name": {
@@ -21,6 +34,7 @@
         "copy_to": "last_name_sort"
       },
       "last_name_sort": {
+        "normalizer": "name_normalizer",
         "type": "keyword"
       },
       "gender": {
@@ -228,4 +242,4 @@
       }
     }
   }
-}
+}
\ No newline at end of file
diff --git a/rero_ils/utils.py b/rero_ils/utils.py
index 4ff15d3ff8..b2b147bfdf 100644
--- a/rero_ils/utils.py
+++ b/rero_ils/utils.py
@@ -127,3 +127,13 @@ def language_iso639_2to1(lang):
         return default_ln
     supported_languages = [v[0] for v in current_i18n.get_languages()]
     return ln if ln in supported_languages else default_ln
+
+
+def language_mapping(lang):
+    """Language mapping.
+
+    :param lang: bibliographic language code
+    :returns: language mapping
+    """
+    return current_app.config.get('RERO_ILS_LANGUAGE_MAPPING', {})\
+        .get(lang, lang)
diff --git a/tests/api/test_external_services.py b/tests/api/test_external_services.py
index be1227eea5..caf43e6654 100644
--- a/tests/api/test_external_services.py
+++ b/tests/api/test_external_services.py
@@ -45,6 +45,8 @@ def clean_authorized_access_point(data):
             contribution['agent'] = agent
             contributions.append(contribution)
 
+        data.pop('sort_date_new', None)
+        data.pop('sort_date_old', None)
         data.pop('sort_title', None)
         data.pop('isbn', None)
         return data
diff --git a/tests/unit/test_documents_utils.py b/tests/unit/test_documents_utils.py
new file mode 100644
index 0000000000..7d0e5680b9
--- /dev/null
+++ b/tests/unit/test_documents_utils.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+#
+# RERO ILS
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Document utils tests."""
+
+from __future__ import absolute_import, print_function
+
+from rero_ils.modules.documents.utils import title_format_text_head
+
+
+def test_title_format_text_head():
+    """Test title format text head."""
+    data = [{
+        "mainTitle": [
+            {
+                "value": "Dingding lixianji"
+            },
+            {
+                "value": "\u4e01\u4e01\u5386\u9669\u8bb0",
+                "language": "und-hani"
+            }
+        ],
+        "type": "bf:Title"
+    }]
+    assert "\u4e01\u4e01\u5386\u9669\u8bb0" == title_format_text_head(data)
+
+    data = [{
+        "mainTitle": [
+            {
+                "value": "Die russischen orthodoxen Bischöfe von 1893",
+            }
+        ],
+        "subtitle": [
+            {
+                "value": "Bio-Bibliographie"
+            }
+        ],
+        "type": "bf:Title"
+      }
+    ]
+    assert "Die russischen orthodoxen Bischöfe von 1893 " \
+           ": Bio-Bibliographie" == title_format_text_head(data)
+
+    data = [{
+        "mainTitle": [
+            {
+                "value": "Die russischen orthodoxen Bischöfe von 1893",
+            },
+            {
+                "value": "The Russian Orthodox Bishops of 1893",
+                "language": "eng"
+            }
+        ],
+        "subtitle": [
+            {
+                "value": "Bio-Bibliographie"
+            }
+        ],
+        "type": "bf:Title"
+      }
+    ]
+    assert "The Russian Orthodox Bishops of 1893" == \
+        title_format_text_head(data)
diff --git a/tests/unit/test_normalizer_stop_words.py b/tests/unit/test_normalizer_stop_words.py
new file mode 100644
index 0000000000..f75d205a67
--- /dev/null
+++ b/tests/unit/test_normalizer_stop_words.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+#
+# RERO ILS
+# Copyright (C) 2021 RERO
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""Normalizer stop words tests."""
+
+from rero_ils.modules.normalizer_stop_words import NormalizerStopWords
+
+
+def test_normalize(app):
+    """Test stop words normalize."""
+    # ---- The string is not analyzed
+    app.config['RERO_ILS_STOP_WORDS_ACTIVATE'] = False
+    normalizer = NormalizerStopWords(app)
+    text = "L'été a été très chaud."
+    assert text == normalizer.normalize(text)
+
+    # ---- The string is analyzed
+    app.config['RERO_ILS_STOP_WORDS_ACTIVATE'] = True
+    app.config['RERO_ILS_STOP_WORDS_PUNCTUATION'] = [
+        '"', ',', ';', ':', r'\.', '_', r'\?', r'\!', r'\*', r'\+', '\n'
+    ]
+    normalizer = NormalizerStopWords(app)
+    text = "L'été a été très chaud."
+    text_norm = "L'été a été très chaud"
+    # The language is not defined. Removal of punctuation only.
+    assert text_norm == normalizer.normalize(text)
+
+    # Deleting words for the defined language.
+    text_norm = "été a été très chaud"
+    app.config['RERO_ILS_STOP_WORDS'] = {
+        'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"]
+    }
+    assert text_norm == normalizer.normalize(text, 'fre')
+
+    text = 'Journal des tribunaux : jurisprudence fédérale. ' \
+        '4, Droit pénal et procédure pénale'
+    text_norm = 'Journal tribunaux jurisprudence fédérale ' \
+        '4 Droit pénal et procédure pénale'
+    assert text_norm == normalizer.normalize(text, 'fre')
+
+    # The language was not found in the definition of stop words.
+    text = "He plays this musical phrase quite well."
+    text_norm = "He plays this musical phrase quite well"
+    assert text_norm == normalizer.normalize(text, 'eng')
+
+    # Deleting words with the default definition.
+    text = "L'été a été très chaud."
+    text_norm = "été a été chaud"
+    app.config['RERO_ILS_STOP_WORDS'] = {
+        'default': ["l'", "très"]
+    }
+    normalizer = NormalizerStopWords(app)
+    assert text_norm == normalizer.normalize(text, 'und')
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 82d4c335ad..21362edfe0 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -25,7 +25,7 @@
 from rero_ils.modules.utils import add_years, extracted_data_from_ref, \
     get_endpoint_configuration, get_schema_for_resource, read_json_record
 from rero_ils.utils import get_current_language, language_iso639_2to1, \
-    unique_list
+    language_mapping, unique_list
 
 
 def test_unique_list():
@@ -110,3 +110,9 @@ def test_language_iso639_2to1(app):
     assert language_iso639_2to1('ita') == 'it'
     # default language
     assert language_iso639_2to1('rus') == 'en'
+
+
+def test_language_mapping(app):
+    """Test language mapping."""
+    assert 'fre' == language_mapping('fre')
+    assert 'dut' == language_mapping('dum')