data: implement stop words

* Adds a stop-word-based normalizer.
* Adds fields on the documents for sorting.
* Adds fields on the contributions for sorting.
* Adds Elasticsearch configuration for sorting on resources.
* Closes rero#2396.

Co-Authored-by: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
Garfield-fr · Sep 28, 2021 · d7f0e08 · d7f0e08
1 parent a35fc49
commit d7f0e08
Show file tree

Hide file tree

Showing 8 changed files with 344 additions and 52 deletions.
diff --git a/rero_ils/config.py b/rero_ils/config.py
@@ -2205,6 +2205,58 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['circ_policies'] = dict(
     query='bestmatch', noquery='name')
 
+# ------ COLLECTIONS SORT
+RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
+    fields=['start_date', 'title_sort'], title='Start date and title',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
+    fields=['title_sort'], title='title',
+    default_order='asc'
+)
+RECORDS_REST_DEFAULT_SORT['collections'] = dict(
+    query='bestmatch', noquery='start_date')
+
+# ------ CONTRIBUTIONS SORT
+RECORDS_REST_SORT_OPTIONS['contributions']['fr_name'] = dict(
+    fields=[
+        'idref_authorized_access_point_sort',
+        'rero_authorized_access_point_sort',
+        'gnd_authorized_access_point_sort',
+    ],
+    title='Collection french name',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['contributions']['de_name'] = dict(
+    fields=[
+        'gnd_authorized_access_point_sort',
+        'idref_authorized_access_point_sort',
+        'rero_authorized_access_point_sort'
+    ],
+    title='Collection german name',
+    default_order='asc'
+)
+
+# ------ DOCUMENTS SORT
+RECORDS_REST_SORT_OPTIONS['documents']['title'] = dict(
+    fields=['sort_title'], title='Document title',
+    default_order='asc'
+)
+
+RECORDS_REST_SORT_OPTIONS['documents']['pub_date_new'] = dict(
+    fields=[
+        '-provision_activity_end_date',
+        '-provision_activity_start_date'
+    ],
+    title='Document date (newest)',
+    default_order='desc'
+)
+
+RECORDS_REST_SORT_OPTIONS['documents']['pub_date_old'] = dict(
+    fields=['provision_activity_start_date'], title='Document date (oldest)',
+    default_order='asc'
+)
+
 # ------ HOLDINGS SORT
 RECORDS_REST_SORT_OPTIONS['holdings']['library_location'] = dict(
     fields=['library.pid', 'location.pid'],
@@ -2214,7 +2266,19 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['holdings'] = dict(
     query='bestmatch', noquery='library_location')
 
-# ------ ITEM SORT
+# ------ ITEMS SORT
+RECORDS_REST_SORT_OPTIONS['items']['call_number'] = dict(
+    fields=['call_number'], title='Call Number',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['items']['second_call_number'] = dict(
+    fields=['second_call_number'], title='Second call Number',
+    default_order='asc'
+)
+RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
+    fields=['issue.expected_date'], title='Issue expected date',
+    default_order='asc'
+)
 RECORDS_REST_SORT_OPTIONS['items']['enumeration_chronology'] = dict(
     fields=['-enumerationAndChronology'], title='Enumeration and Chronology',
     default_order='desc'
@@ -2239,6 +2303,10 @@ def _(x):
     fields=['library_name'], title='Library name',
     default_order='asc'
 )
+RECORDS_REST_SORT_OPTIONS['libraries']['code'] = dict(
+    fields=['code'], title='Library code',
+    default_order='asc'
+)
 RECORDS_REST_DEFAULT_SORT['libraries'] = dict(
     query='bestmatch', noquery='name')
 
@@ -2282,19 +2350,6 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['patron_types'] = dict(
     query='bestmatch', noquery='name')
 
-# ------ VENDORS SORT
-RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
-    fields=['vendor_name'], title='Vendor name',
-    default_order='asc'
-)
-RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
-    query='bestmatch', noquery='name')
-
-# ------ ITEMS SORT
-RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
-    fields=['issue.expected_date'], title='Issue expected date',
-    default_order='asc'
-)
 # ------ TEMPLATES SORT
 RECORDS_REST_SORT_OPTIONS['templates']['name'] = dict(
     fields=['name_sort'], title='Template name',
@@ -2303,18 +2358,14 @@ def _(x):
 RECORDS_REST_DEFAULT_SORT['templates'] = dict(
     query='bestmatch', noquery='name')
 
-# ------ COLLECTIONS SORT
-RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
-    fields=['start_date', 'title_sort'], title='Start date and title',
-    default_order='asc'
-)
-RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
-    fields=['title_sort'], title='title',
+# ------ VENDORS SORT
+RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
+    fields=['vendor_name'], title='Vendor name',
     default_order='asc'
 )
+RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
+    query='bestmatch', noquery='name')
 
-RECORDS_REST_DEFAULT_SORT['collections'] = dict(
-    query='bestmatch', noquery='start_date')
 
 # Detailed View Configuration
 # ===========================
@@ -2806,3 +2857,23 @@ def _(x):
 
 # OAuth base template
 OAUTH2SERVER_COVER_TEMPLATE = 'rero_ils/oauth/base.html'
+
+# STOP WORDS
+# ==========
+# ACTIVATE STOP WORDS NORMALIZATION
+RERO_ILS_STOP_WORDS_ACTIVATE = True
+# PUNCTUATION
+RERO_ILS_STOP_WORDS_PUNCTUATION = [
+    '"', ';', ':', '\\.', '-', '_', '\\?', '\\!', '\\*', '\\+', '\n'
+]
+# STOP WORDS BY LANGUAGE
+# A default configuration can be added with a "default" entry.
+RERO_ILS_STOP_WORDS = {
+    'eng': ["a", "an", "the"],
+    'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"],
+    'ger': ["das", "dem", "den", "der", "des", "die"],
+    'ita': [
+        "gli", "i", "il", "l'", "la", "le", "li", "lo", "un", "un'",
+        "una", "uno"],
+    'spa': ["el", "la", "las", "lo", "los", "un", "una", "unas", "unos"]
+}
diff --git a/rero_ils/es_templates/v7/record.json b/rero_ils/es_templates/v7/record.json
@@ -6,6 +6,11 @@
     "max_result_window": "100000",
     "analysis": {
       "filter": {
+        "edge_ngram_filter": {
+          "type": "edge_ngram",
+          "min_gram": 3,
+          "max_gram": 10
+        },
         "french_elision": {
           "type": "elision",
           "articles_case": true,
@@ -25,6 +30,32 @@
             "puisqu"
           ]
         },
+        "italian_elision": {
+          "type": "elision",
+          "articles": [
+              "c", "l", "all", "dall", "dell",
+              "nell", "sull", "coll", "pell",
+              "gl", "agl", "dagl", "degl", "negl",
+              "sugl", "un", "m", "t", "s", "v", "d"
+          ],
+          "articles_case": true
+        },
+        "english_stop": {
+          "type":       "stop",
+          "stopwords":  "_english_"
+        },
+        "french_stop": {
+          "type":       "stop",
+          "stopwords":  "_french_"
+        },
+        "german_stop": {
+          "type":       "stop",
+          "stopwords":  "_german_"
+        },
+        "italian_stop": {
+          "type":       "stop",
+          "stopwords":  "_italian_"
+        },
         "french_stemmer": {
           "type": "stemmer",
           "language": "light_french"
@@ -65,6 +96,26 @@
             "icu_folding",
             "german_normalization"
           ]
+        },
+        "autocomplete": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "icu_normalizer",
+            "icu_folding",
+            "french_elision",
+            "italian_elision",
+            "edge_ngram_filter"
+          ]
+        }
+      },
+      "normalizer": {
+        "sort_normalizer": {
+          "type": "custom",
+          "filter": [
+            "lowercase"
+          ]
         }
       }
     }

diff --git a/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json b/rero_ils/modules/contributions/mappings/v7/contributions/contribution-v0.0.1.json
@@ -29,6 +29,18 @@
         "analyzer": "autocomplete",
         "search_analyzer": "standard"
       },
+      "gnd_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
+      "idref_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
+      "rero_authorized_access_point_sort": {
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
+      },
       "$schema": {
         "type": "keyword"
       },
@@ -100,7 +112,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "gnd_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"
@@ -203,7 +218,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "idref_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"
@@ -306,7 +324,10 @@
           },
           "authorized_access_point": {
             "type": "text",
-            "copy_to": "autocomplete_name"
+            "copy_to": [
+              "autocomplete_name",
+              "rero_authorized_access_point_sort"
+            ]
           },
           "qualifier": {
             "type": "keyword"

diff --git a/rero_ils/modules/documents/listener.py b/rero_ils/modules/documents/listener.py
@@ -17,6 +17,7 @@
 
 """Signals connector for Document."""
 
+from flask.globals import current_app
 from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13
 
 from .utils import create_contributions, title_format_text_head
@@ -139,17 +140,24 @@ def enrich_document_data(sender, json=None, record=None, index=None,
                         )
             json['title'].append(title)
 
-        json['sort_title'] = title_format_text_head(
+        # sort title
+        sort_title = title_format_text_head(
             json.get('title', []),
             with_subtitle=True
         )
+        language = json.get('language')[0].get('value')
+        if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
+            sort_title = current_app.\
+                extensions['reroils-normalizer-stop-words'].\
+                normalize(sort_title, language)
+        json['sort_title'] = sort_title
         # Local fields in JSON
         local_fields = LocalField.get_local_fields_by_resource(
             'doc', document_pid)
         if local_fields:
             json['local_fields'] = local_fields
-        # index both ISBN 10 and 13 format
 
+        # index both ISBN 10 and 13 format
         def filter_isbn(identified_by):
             """Filter identified_by for type bf:Isbn."""
             return identified_by.get('type') == 'bf:Isbn'
@@ -168,3 +176,17 @@ def filter_isbn(identified_by):
                 isbns.add(to_isbn10(isbn))
         if isbns:
             json['isbn'] = list(isbns)
+
+        # Populate startDate and endDate for use in sorting
+        pub_provisions = [
+            p for p in record.get('provisionActivity', [])
+            if p['type'] == 'bf:Publication'
+        ]
+        pub_provision = next(iter(pub_provisions), None)
+        if pub_provision:
+            if 'startDate' in pub_provision:
+                json['provision_activity_start_date'] = \
+                    pub_provision['startDate']
+            if 'endDate' in pub_provision:
+                json['provision_activity_end_date'] = \
+                    pub_provision['endDate']
diff --git a/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json b/rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json
@@ -1,25 +1,4 @@
 {
-  "settings": {
-    "analysis": {
-      "filter": {
-        "autocomplete_filter": {
-          "type": "edge_ngram",
-          "min_gram": 1,
-          "max_gram": 20
-        }
-      },
-      "analyzer": {
-        "autocomplete": {
-          "type": "custom",
-          "tokenizer": "standard",
-          "filter": [
-            "lowercase",
-            "autocomplete_filter"
-          ]
-        }
-      }
-    }
-  },
   "mappings": {
     "date_detection": false,
     "numeric_detection": false,
@@ -154,11 +133,11 @@
       },
       "autocomplete_title": {
         "type": "text",
-        "analyzer": "autocomplete",
-        "search_analyzer": "standard"
+        "analyzer": "autocomplete"
       },
       "sort_title": {
-        "type": "keyword"
+        "type": "keyword",
+        "normalizer": "sort_normalizer"
       },
       "responsibilityStatement": {
         "type": "object",
@@ -368,6 +347,12 @@
           }
         }
       },
+      "provision_activity_start_date": {
+        "type": "integer"
+      },
+      "provision_activity_end_date": {
+        "type": "integer"
+      },
       "provisionActivity": {
         "type": "object",
         "properties": {