data: implement stop words

Warning: reindexing documents, contributions and patrons is mandatory.

* Adds a stop word based normalizer.
* Adds fields on the documents for sorting.
* Adds fields on the contributions for sorting.
* Adds elasticsearch configuration for sorting on resources.
* Adds language mapping for code conversion.
* Fixes generating the document title on _text.
* Closes rero#2396.

Co-Authored-by: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
Co-Authored-by: Johnny Mariéthoz <johnny.mariethoz@rero.ch>
zannkukai · Oct 13, 2021 · c4bdd9e · c4bdd9e
1 parent b488c63
commit c4bdd9e
Show file tree

Hide file tree

Showing 14 changed files with 475 additions and 54 deletions.
diff --git a/rero_ils/config.py b/rero_ils/config.py
@@ -2205,6 +2205,54 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['circ_policies'] = dict(
     query='bestmatch', noquery='name')
 
+# ------ COLLECTIONS SORT
+RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
+    fields=['start_date', 'title_sort'], title='Start date and title',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
+    fields=['title_sort'], title='title',
+    default_order='asc'
+)
+RECORDS_REST_DEFAULT_SORT['collections'] = dict(
+    query='bestmatch', noquery='start_date')
+
+# ------ CONTRIBUTIONS SORT
+RECORDS_REST_SORT_OPTIONS['contributions']['fr_name'] = dict(
+    fields=[
+        'idref_authorized_access_point_sort',
+        'rero_authorized_access_point_sort',
+        'gnd_authorized_access_point_sort',
+    ],
+    title='Collection french name',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['contributions']['de_name'] = dict(
+    fields=[
+        'gnd_authorized_access_point_sort',
+        'idref_authorized_access_point_sort',
+        'rero_authorized_access_point_sort'
+    ],
+    title='Collection german name',
+    default_order='asc'
+)
+
+# ------ DOCUMENTS SORT
+RECORDS_REST_SORT_OPTIONS['documents']['title'] = dict(
+    fields=['sort_title'], title='Document title',
+    default_order='asc'
+)
+
+RECORDS_REST_SORT_OPTIONS['documents']['pub_date_new'] = dict(
+    fields=['-sort_date_new'], title='Document date (newest)',
+    default_order='desc'
+)
+
+RECORDS_REST_SORT_OPTIONS['documents']['pub_date_old'] = dict(
+    fields=['sort_date_old'], title='Document date (oldest)',
+    default_order='asc'
+)
+
 # ------ HOLDINGS SORT
 RECORDS_REST_SORT_OPTIONS['holdings']['library_location'] = dict(
     fields=['library.pid', 'location.pid'],
@@ -2214,7 +2262,24 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['holdings'] = dict(
     query='bestmatch', noquery='library_location')
 
-# ------ ITEM SORT
+# ------ ITEMS SORT
+RECORDS_REST_SORT_OPTIONS['items']['barcode'] = dict(
+    fields=['barcode'], title='Barcode',
+    default_order='asc'
+)
+
+RECORDS_REST_SORT_OPTIONS['items']['call_number'] = dict(
+    fields=['call_number'], title='Call Number',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['items']['second_call_number'] = dict(
+    fields=['second_call_number'], title='Second call Number',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
+    fields=['issue.expected_date'], title='Issue expected date',
+    default_order='asc'
+)
 RECORDS_REST_SORT_OPTIONS['items']['enumeration_chronology'] = dict(
     fields=['-enumerationAndChronology'], title='Enumeration and Chronology',
     default_order='desc'
@@ -2239,6 +2304,10 @@ def _(x):
     fields=['library_name'], title='Library name',
     default_order='asc'
 )
+RECORDS_REST_SORT_OPTIONS['libraries']['code'] = dict(
+    fields=['code'], title='Library code',
+    default_order='asc'
+)
 RECORDS_REST_DEFAULT_SORT['libraries'] = dict(
     query='bestmatch', noquery='name')
 
@@ -2282,19 +2351,6 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['patron_types'] = dict(
     query='bestmatch', noquery='name')
 
-# ------ VENDORS SORT
-RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
-    fields=['vendor_name'], title='Vendor name',
-    default_order='asc'
-)
-RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
-    query='bestmatch', noquery='name')
-
-# ------ ITEMS SORT
-RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
-    fields=['issue.expected_date'], title='Issue expected date',
-    default_order='asc'
-)
 # ------ TEMPLATES SORT
 RECORDS_REST_SORT_OPTIONS['templates']['name'] = dict(
     fields=['name_sort'], title='Template name',
@@ -2303,18 +2359,14 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['templates'] = dict(
     query='bestmatch', noquery='name')
 
-# ------ COLLECTIONS SORT
-RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
-    fields=['start_date', 'title_sort'], title='Start date and title',
-    default_order='asc'
-)
-RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
-    fields=['title_sort'], title='title',
+# ------ VENDORS SORT
+RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
+    fields=['vendor_name'], title='Vendor name',
     default_order='asc'
 )
+RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
+    query='bestmatch', noquery='name')
 
-RECORDS_REST_DEFAULT_SORT['collections'] = dict(
-    query='bestmatch', noquery='start_date')
 
 # Detailed View Configuration
 # ===========================
@@ -2805,3 +2857,43 @@ def _(x):
 
 # OAuth base template
 OAUTH2SERVER_COVER_TEMPLATE = 'rero_ils/oauth/base.html'
+
+# STOP WORDS
+# Disregarded articles for sorting processes
+# ==========
+# ACTIVATE STOP WORDS NORMALIZATION
+RERO_ILS_STOP_WORDS_ACTIVATE = True
+# PUNCTUATION
+RERO_ILS_STOP_WORDS_PUNCTUATION = [
+    r'\[', r'\]', '"', ',', ';', ':', r'\.', '_',
+    r'\?', r'\!', r'\*', r'\+', '\n'
+]
+# STOP WORDS BY LANGUAGE
+# A fallback configuration can be added under a "default" entry.
+# It is used when the language has no entry of its own.
+RERO_ILS_STOP_WORDS = {
+    'dan': ["de", "den", "det", "en", "et"],
+    'dut': [
+        "d'", "de", "den", "der", "des", "het", "'s", "'t", "een",
+        "eene", "eener", "eens", "ene", "'n"],
+    'eng': ["a", "an", "the"],
+    'epo': ["la", "l'", "unu"],
+    'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"],
+    'ger': [
+        "das", "dem", "den", "der", "des", "die",
+        "ein", "eine", "einem", "einen", "einer", "eines"],
+    'hun': [ "a", "az", "egy"],
+    'ita': [
+        "gli", "i", "il", "l'", "la", "le", "li", "lo",
+        "un", "un'", "una", "uno"],
+    'nor': ["de", "dei", "den", "det", "ei", "en", "et"],
+    'por': ["a", "as", "o", "os", "um", "uma", "umas", "uns"],
+    'spa': ["el", "la", "las", "lo", "los", "un", "una", "unas", "unos"],
+    'swe': ["de", "den", "det", "en", "ett"]
+}
+
+# LANGUAGE MAPPING
+# ================
+RERO_ILS_LANGUAGE_MAPPING = {
+    'dum': 'dut'  # Middle Dutch -> Dutch
+}
diff --git a/rero_ils/es_templates/v7/record.json b/rero_ils/es_templates/v7/record.json
@@ -6,6 +6,11 @@
     "max_result_window": "100000",
     "analysis": {
       "filter": {
+        "edge_ngram_filter": {
+          "type": "edge_ngram",
+          "min_gram": 3,
+          "max_gram": 10
+        },
         "french_elision": {
           "type": "elision",
           "articles_case": true,
@@ -25,6 +30,32 @@
             "puisqu"
           ]
         },
+        "italian_elision": {
+          "type": "elision",
+          "articles": [
+              "c", "l", "all", "dall", "dell",
+              "nell", "sull", "coll", "pell",
+              "gl", "agl", "dagl", "degl", "negl",
+              "sugl", "un", "m", "t", "s", "v", "d"
+          ],
+          "articles_case": true
+        },
+        "english_stop": {
+          "type":       "stop",
+          "stopwords":  "_english_"
+        },
+        "french_stop": {
+          "type":       "stop",
+          "stopwords":  "_french_"
+        },
+        "german_stop": {
+          "type":       "stop",
+          "stopwords":  "_german_"
+        },
+        "italian_stop": {
+          "type":       "stop",
+          "stopwords":  "_italian_"
+        },
         "french_stemmer": {
           "type": "stemmer",
           "language": "light_french"
@@ -65,6 +96,26 @@
             "icu_folding",
             "german_normalization"
           ]
+        },
+        "autocomplete": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "icu_normalizer",
+            "icu_folding",
+            "french_elision",
+            "italian_elision",
+            "edge_ngram_filter"
+          ]
+        }
+      },
+      "normalizer": {
+        "sort_normalizer": {
+          "type": "custom",
+          "filter": [
+            "lowercase"
+          ]
         }
       }
     }

diff --git a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json
@@ -29,6 +29,18 @@
         "analyzer": "autocomplete",
         "search_analyzer": "standard"
       },
+      "gnd_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
+      "idref_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
+      "rero_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
       "$schema": {
         "type": "keyword"
       },
@@ -100,7 +112,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "gnd_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"
@@ -203,7 +218,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "idref_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"
@@ -306,7 +324,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "rero_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"

diff --git a/rero_ils/modules/documents/listener.py b/rero_ils/modules/documents/listener.py
@@ -17,6 +17,7 @@
 
 """Signals connector for Document."""
 
+from flask.globals import current_app
 from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13
 
 from .utils import create_contributions, title_format_text_head
@@ -26,6 +27,7 @@
 from ..items.models import ItemNoteTypes
 from ..local_fields.api import LocalField
 from ..utils import extracted_data_from_ref
+from ...utils import language_mapping
 
 
 def enrich_document_data(sender, json=None, record=None, index=None,
@@ -139,17 +141,24 @@ def enrich_document_data(sender, json=None, record=None, index=None,
                         )
             json['title'].append(title)
 
-        json['sort_title'] = title_format_text_head(
+        # sort title
+        sort_title = title_format_text_head(
             json.get('title', []),
             with_subtitle=True
         )
+        language = language_mapping(json.get('language')[0].get('value'))
+        if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
+            sort_title = current_app.\
+                extensions['reroils-normalizer-stop-words'].\
+                normalize(sort_title, language)
+        json['sort_title'] = sort_title
         # Local fields in JSON
         local_fields = LocalField.get_local_fields_by_resource(
             'doc', document_pid)
         if local_fields:
             json['local_fields'] = local_fields
-        # index both ISBN 10 and 13 format
 
+        # index both ISBN 10 and 13 format
         def filter_isbn(identified_by):
             """Filter identified_by for type bf:Isbn."""
             return identified_by.get('type') == 'bf:Isbn'
@@ -168,3 +177,14 @@ def filter_isbn(identified_by):
                 isbns.add(to_isbn10(isbn))
         if isbns:
             json['isbn'] = list(isbns)
+
+        # Populate sort date new and old for use in sorting
+        pub_provisions = [
+            p for p in record.get('provisionActivity', [])
+            if p['type'] == 'bf:Publication'
+        ]
+        pub_provision = next(iter(pub_provisions), None)
+        if pub_provision:
+            json['sort_date_new'] = \
+                pub_provision.get('endDate', pub_provision.get('startDate'))
+            json['sort_date_old'] = pub_provision.get('startDate')