Skip to content

Commit

Permalink
data: implement stop words
Browse files Browse the repository at this point in the history
Warning: reindexing documents, contributions and patrons is mandatory.

* Adds a stop word based normalizer.
* Adds fields on the documents for sorting.
* Adds fields on the contributions for sorting.
* Adds elasticsearch configuration for sorting on resources.
* Adds language mapping for code conversion.
* Fixes generating the document title on _text.
* Closes rero#2396.

Co-Authored-by: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
Co-Authored-by: Johnny Mariéthoz <johnny.mariethoz@rero.ch>
  • Loading branch information
Garfield-fr and jma committed Oct 7, 2021
1 parent 5b13154 commit 8980371
Show file tree
Hide file tree
Showing 14 changed files with 475 additions and 54 deletions.
138 changes: 115 additions & 23 deletions rero_ils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2205,6 +2205,54 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['circ_policies'] = dict(
query='bestmatch', noquery='name')

# ------ COLLECTIONS SORT
# Sort fields, in order: start date, then title.
RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = {
    'fields': ['start_date', 'title_sort'],
    'title': 'Start date and title',
    'default_order': 'asc',
}
# Sort on the title only.
RECORDS_REST_SORT_OPTIONS['collections']['title'] = {
    'fields': ['title_sort'],
    'title': 'title',
    'default_order': 'asc',
}
# Default sort keys for collections: one for requests with a query
# ('query') and one for requests without ('noquery').
RECORDS_REST_DEFAULT_SORT['collections'] = {
    'query': 'bestmatch',
    'noquery': 'start_date',
}

# ------ CONTRIBUTIONS SORT
# NOTE(review): both titles below say "Collection ..." although these are
# contribution sort options -- confirm the intended wording.

# French-oriented name sort: IdRef first, then RERO, then GND.
RECORDS_REST_SORT_OPTIONS['contributions']['fr_name'] = {
    'fields': [
        'idref_authorized_access_point_sort',
        'rero_authorized_access_point_sort',
        'gnd_authorized_access_point_sort',
    ],
    'title': 'Collection french name',
    'default_order': 'asc',
}
# German-oriented name sort: GND first, then IdRef, then RERO.
RECORDS_REST_SORT_OPTIONS['contributions']['de_name'] = {
    'fields': [
        'gnd_authorized_access_point_sort',
        'idref_authorized_access_point_sort',
        'rero_authorized_access_point_sort',
    ],
    'title': 'Collection german name',
    'default_order': 'asc',
}

# ------ DOCUMENTS SORT
# Sort on the document sort title.
RECORDS_REST_SORT_OPTIONS['documents']['title'] = {
    'fields': ['sort_title'],
    'title': 'Document title',
    'default_order': 'asc',
}

# Newest publication date first.
RECORDS_REST_SORT_OPTIONS['documents']['pub_date_new'] = {
    'fields': ['-sort_date_new'],
    'title': 'Document date (newest)',
    'default_order': 'desc',
}

# Oldest publication date first.
RECORDS_REST_SORT_OPTIONS['documents']['pub_date_old'] = {
    'fields': ['sort_date_old'],
    'title': 'Document date (oldest)',
    'default_order': 'asc',
}

# ------ HOLDINGS SORT
RECORDS_REST_SORT_OPTIONS['holdings']['library_location'] = dict(
fields=['library.pid', 'location.pid'],
Expand All @@ -2214,7 +2262,24 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['holdings'] = dict(
query='bestmatch', noquery='library_location')

# ------ ITEM SORT
# ------ ITEMS SORT
# Sort items on their barcode.
RECORDS_REST_SORT_OPTIONS['items']['barcode'] = {
    'fields': ['barcode'],
    'title': 'Barcode',
    'default_order': 'asc',
}

# Sort items on their (first) call number.
RECORDS_REST_SORT_OPTIONS['items']['call_number'] = {
    'fields': ['call_number'],
    'title': 'Call Number',
    'default_order': 'asc',
}
# Sort items on their second call number.
RECORDS_REST_SORT_OPTIONS['items']['second_call_number'] = {
    'fields': ['second_call_number'],
    'title': 'Second call Number',
    'default_order': 'asc',
}
# Sort items on the expected date of their issue.
RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = {
    'fields': ['issue.expected_date'],
    'title': 'Issue expected date',
    'default_order': 'asc',
}
RECORDS_REST_SORT_OPTIONS['items']['enumeration_chronology'] = dict(
fields=['-enumerationAndChronology'], title='Enumeration and Chronology',
default_order='desc'
Expand All @@ -2239,6 +2304,10 @@ def _(x):
fields=['library_name'], title='Library name',
default_order='asc'
)
# Sort libraries on their code.
RECORDS_REST_SORT_OPTIONS['libraries']['code'] = {
    'fields': ['code'],
    'title': 'Library code',
    'default_order': 'asc',
}
# Default sort keys for libraries: one for requests with a query
# ('query') and one for requests without ('noquery').
RECORDS_REST_DEFAULT_SORT['libraries'] = {
    'query': 'bestmatch',
    'noquery': 'name',
}

Expand Down Expand Up @@ -2282,19 +2351,6 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['patron_types'] = dict(
query='bestmatch', noquery='name')

# ------ VENDORS SORT
RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
fields=['vendor_name'], title='Vendor name',
default_order='asc'
)
RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
query='bestmatch', noquery='name')

# ------ ITEMS SORT
RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
fields=['issue.expected_date'], title='Issue expected date',
default_order='asc'
)
# ------ TEMPLATES SORT
RECORDS_REST_SORT_OPTIONS['templates']['name'] = dict(
fields=['name_sort'], title='Template name',
Expand All @@ -2303,18 +2359,14 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['templates'] = dict(
query='bestmatch', noquery='name')

# ------ COLLECTIONS SORT
RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
fields=['start_date', 'title_sort'], title='Start date and title',
default_order='asc'
)
RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
fields=['title_sort'], title='title',
# ------ VENDORS SORT
RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
fields=['vendor_name'], title='Vendor name',
default_order='asc'
)
RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
query='bestmatch', noquery='name')

RECORDS_REST_DEFAULT_SORT['collections'] = dict(
query='bestmatch', noquery='start_date')

# Detailed View Configuration
# ===========================
Expand Down Expand Up @@ -2805,3 +2857,43 @@ def _(x):

# OAuth base template
OAUTH2SERVER_COVER_TEMPLATE = 'rero_ils/oauth/base.html'

# STOP WORDS
# ==========
# Articles that are disregarded when sorting.

# Switch that activates the stop-word normalization.
RERO_ILS_STOP_WORDS_ACTIVATE = True

# Punctuation entries. Several are backslash-escaped, presumably because
# they are used as regular-expression fragments -- confirm against the
# normalizer implementation.
RERO_ILS_STOP_WORDS_PUNCTUATION = [
    r'\[',
    r'\]',
    '"',
    ',',
    ';',
    ':',
    r'\.',
    '_',
    r'\?',
    r'\!',
    r'\*',
    r'\+',
    '\n',
]

# Stop words keyed by three-letter language code.
# A "default" entry may be added; it is used when the language of the
# record has no entry of its own.
RERO_ILS_STOP_WORDS = {
    'dan': ["de", "den", "det", "en", "et"],
    'dut': [
        "d'", "de", "den", "der", "des", "het", "'s", "'t",
        "een", "eene", "eener", "eens", "ene", "'n",
    ],
    'eng': ["a", "an", "the"],
    'epo': ["la", "l'", "unu"],
    'fre': ["de", "des", "du", "l'", "la", "le", "les", "un", "une"],
    'ger': [
        "das", "dem", "den", "der", "des", "die",
        "ein", "eine", "einem", "einen", "einer", "eines",
    ],
    'hun': ["a", "az", "egy"],
    'ita': [
        "gli", "i", "il", "l'", "la", "le", "li", "lo",
        "un", "un'", "una", "uno",
    ],
    'nor': ["de", "dei", "den", "det", "ei", "en", "et"],
    'por': ["a", "as", "o", "os", "um", "uma", "umas", "uns"],
    'spa': ["el", "la", "las", "lo", "los", "un", "una", "unas", "unos"],
    'swe': ["de", "den", "det", "en", "ett"],
}

# LANGUAGE MAPPING
# ================
# Converts a language code to the code used for the stop-word lookup.
RERO_ILS_LANGUAGE_MAPPING = {
    'dum': 'dut',  # 'dum' is handled as Dutch
}
51 changes: 51 additions & 0 deletions rero_ils/es_templates/v7/record.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
"max_result_window": "100000",
"analysis": {
"filter": {
"edge_ngram_filter": {
"type": "edge_ngram",
"min_gram": 3,
"max_gram": 10
},
"french_elision": {
"type": "elision",
"articles_case": true,
Expand All @@ -25,6 +30,32 @@
"puisqu"
]
},
"italian_elision": {
"type": "elision",
"articles": [
"c", "l", "all", "dall", "dell",
"nell", "sull", "coll", "pell",
"gl", "agl", "dagl", "degl", "negl",
"sugl", "un", "m", "t", "s", "v", "d"
],
"articles_case": true
},
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"german_stop": {
"type": "stop",
"stopwords": "_german_"
},
"italian_stop": {
"type": "stop",
"stopwords": "_italian_"
},
"french_stemmer": {
"type": "stemmer",
"language": "light_french"
Expand Down Expand Up @@ -65,6 +96,26 @@
"icu_folding",
"german_normalization"
]
},
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"icu_normalizer",
"icu_folding",
"french_elision",
"italian_elision",
"edge_ngram_filter"
]
}
},
"normalizer": {
"sort_normalizer": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@
"analyzer": "autocomplete",
"search_analyzer": "standard"
},
"gnd_authorized_access_point_sort": {
"type": "keyword",
"normalizer": "sort_normalizer"
},
"idref_authorized_access_point_sort": {
"type": "keyword",
"normalizer": "sort_normalizer"
},
"rero_authorized_access_point_sort": {
"type": "keyword",
"normalizer": "sort_normalizer"
},
"$schema": {
"type": "keyword"
},
Expand Down Expand Up @@ -100,7 +112,10 @@
},
"authorized_access_point": {
"type": "text",
"copy_to": "autocomplete_name"
"copy_to": [
"autocomplete_name",
"gnd_authorized_access_point_sort"
]
},
"qualifier": {
"type": "keyword"
Expand Down Expand Up @@ -203,7 +218,10 @@
},
"authorized_access_point": {
"type": "text",
"copy_to": "autocomplete_name"
"copy_to": [
"autocomplete_name",
"idref_authorized_access_point_sort"
]
},
"qualifier": {
"type": "keyword"
Expand Down Expand Up @@ -306,7 +324,10 @@
},
"authorized_access_point": {
"type": "text",
"copy_to": "autocomplete_name"
"copy_to": [
"autocomplete_name",
"rero_authorized_access_point_sort"
]
},
"qualifier": {
"type": "keyword"
Expand Down
24 changes: 22 additions & 2 deletions rero_ils/modules/documents/listener.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

"""Signals connector for Document."""

from flask.globals import current_app
from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13

from .utils import create_contributions, title_format_text_head
Expand All @@ -26,6 +27,7 @@
from ..items.models import ItemNoteTypes
from ..local_fields.api import LocalField
from ..utils import extracted_data_from_ref
from ...utils import language_mapping


def enrich_document_data(sender, json=None, record=None, index=None,
Expand Down Expand Up @@ -139,17 +141,24 @@ def enrich_document_data(sender, json=None, record=None, index=None,
)
json['title'].append(title)

json['sort_title'] = title_format_text_head(
# sort title
sort_title = title_format_text_head(
json.get('title', []),
with_subtitle=True
)
language = language_mapping(json.get('language')[0].get('value'))
if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
sort_title = current_app.\
extensions['reroils-normalizer-stop-words'].\
normalize(sort_title, language)
json['sort_title'] = sort_title
# Local fields in JSON
local_fields = LocalField.get_local_fields_by_resource(
'doc', document_pid)
if local_fields:
json['local_fields'] = local_fields
# index both ISBN 10 and 13 format

# index both ISBN 10 and 13 format
def filter_isbn(identified_by):
"""Filter identified_by for type bf:Isbn."""
return identified_by.get('type') == 'bf:Isbn'
Expand All @@ -168,3 +177,14 @@ def filter_isbn(identified_by):
isbns.add(to_isbn10(isbn))
if isbns:
json['isbn'] = list(isbns)

# Populate sort date new and old for use in sorting
pub_provisions = [
p for p in record.get('provisionActivity', [])
if p['type'] == 'bf:Publication'
]
pub_provision = next(iter(pub_provisions), None)
if pub_provision:
json['sort_date_new'] = \
pub_provision.get('endDate', pub_provision.get('startDate'))
json['sort_date_old'] = pub_provision.get('startDate')
Loading

0 comments on commit 8980371

Please sign in to comment.