Skip to content

Commit

Permalink
data: implement stop words
Browse files Browse the repository at this point in the history
* Adds a stop-word-based normalizer.
* Adds fields on the documents for sorting.
* Adds fields on the contributions for sorting.
* Adds Elasticsearch configuration for sorting on resources.
* Closes rero#2396.

Co-Authored-by: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
  • Loading branch information
Garfield-fr committed Sep 28, 2021
1 parent a35fc49 commit d7f0e08
Show file tree
Hide file tree
Showing 8 changed files with 344 additions and 52 deletions.
117 changes: 94 additions & 23 deletions rero_ils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2205,6 +2205,58 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['circ_policies'] = dict(
query='bestmatch', noquery='name')

# ------ COLLECTIONS SORT
# Sort by start date, using the sortable title as the secondary key.
RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = {
    'fields': ['start_date', 'title_sort'],
    'title': 'Start date and title',
    'default_order': 'asc',
}
# Sort by the sortable title only.
RECORDS_REST_SORT_OPTIONS['collections']['title'] = {
    'fields': ['title_sort'],
    'title': 'title',
    'default_order': 'asc',
}
# Sort option used when a query string is given ('query') and when it is
# not ('noquery').
RECORDS_REST_DEFAULT_SORT['collections'] = {
    'query': 'bestmatch',
    'noquery': 'start_date',
}

# ------ CONTRIBUTIONS SORT
# Both options sort on the per-source "authorized access point" sort fields;
# they differ only in the order in which the sources are listed.
# French name: IdRef first, then RERO, then GND.
RECORDS_REST_SORT_OPTIONS['contributions']['fr_name'] = dict(
    fields=[
        'idref_authorized_access_point_sort',
        'rero_authorized_access_point_sort',
        'gnd_authorized_access_point_sort',
    ],
    # Title names the resource being sorted (contributions, not collections).
    title='Contribution french name',
    default_order='asc'
)
# German name: GND first, then IdRef, then RERO.
RECORDS_REST_SORT_OPTIONS['contributions']['de_name'] = dict(
    fields=[
        'gnd_authorized_access_point_sort',
        'idref_authorized_access_point_sort',
        'rero_authorized_access_point_sort'
    ],
    # Title names the resource being sorted (contributions, not collections).
    title='Contribution german name',
    default_order='asc'
)

# ------ DOCUMENTS SORT
# Sort by the document sort title.
RECORDS_REST_SORT_OPTIONS['documents']['title'] = {
    'fields': ['sort_title'],
    'title': 'Document title',
    'default_order': 'asc',
}

# Newest first: provision activity end date, then start date.
# NOTE(review): the leading '-' is taken to mean "descending" per the
# invenio-records-rest sort field convention -- confirm.
RECORDS_REST_SORT_OPTIONS['documents']['pub_date_new'] = {
    'fields': [
        '-provision_activity_end_date',
        '-provision_activity_start_date',
    ],
    'title': 'Document date (newest)',
    'default_order': 'desc',
}

# Oldest first: provision activity start date only.
RECORDS_REST_SORT_OPTIONS['documents']['pub_date_old'] = {
    'fields': ['provision_activity_start_date'],
    'title': 'Document date (oldest)',
    'default_order': 'asc',
}

# ------ HOLDINGS SORT
RECORDS_REST_SORT_OPTIONS['holdings']['library_location'] = dict(
fields=['library.pid', 'location.pid'],
Expand All @@ -2214,7 +2266,19 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['holdings'] = dict(
query='bestmatch', noquery='library_location')

# ------ ITEM SORT
# ------ ITEMS SORT
# Sort items by their call number.
RECORDS_REST_SORT_OPTIONS['items']['call_number'] = {
    'fields': ['call_number'],
    'title': 'Call Number',
    'default_order': 'asc',
}
# Sort items by their second call number.
RECORDS_REST_SORT_OPTIONS['items']['second_call_number'] = {
    'fields': ['second_call_number'],
    'title': 'Second call Number',
    'default_order': 'asc',
}
# Sort items by the expected date of their issue.
RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = {
    'fields': ['issue.expected_date'],
    'title': 'Issue expected date',
    'default_order': 'asc',
}
RECORDS_REST_SORT_OPTIONS['items']['enumeration_chronology'] = dict(
fields=['-enumerationAndChronology'], title='Enumeration and Chronology',
default_order='desc'
Expand All @@ -2239,6 +2303,10 @@ def _(x):
fields=['library_name'], title='Library name',
default_order='asc'
)
RECORDS_REST_SORT_OPTIONS['libraries']['code'] = dict(
fields=['code'], title='Library code',
default_order='asc'
)
RECORDS_REST_DEFAULT_SORT['libraries'] = dict(
query='bestmatch', noquery='name')

Expand Down Expand Up @@ -2282,19 +2350,6 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['patron_types'] = dict(
query='bestmatch', noquery='name')

# ------ VENDORS SORT
RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
fields=['vendor_name'], title='Vendor name',
default_order='asc'
)
RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
query='bestmatch', noquery='name')

# ------ ITEMS SORT
RECORDS_REST_SORT_OPTIONS['items']['issue_expected_date'] = dict(
fields=['issue.expected_date'], title='Issue expected date',
default_order='asc'
)
# ------ TEMPLATES SORT
RECORDS_REST_SORT_OPTIONS['templates']['name'] = dict(
fields=['name_sort'], title='Template name',
Expand All @@ -2303,18 +2358,14 @@ def _(x):
RECORDS_REST_DEFAULT_SORT['templates'] = dict(
query='bestmatch', noquery='name')

# ------ COLLECTIONS SORT
RECORDS_REST_SORT_OPTIONS['collections']['start_date'] = dict(
fields=['start_date', 'title_sort'], title='Start date and title',
default_order='asc'
)
RECORDS_REST_SORT_OPTIONS['collections']['title'] = dict(
fields=['title_sort'], title='title',
# ------ VENDORS SORT
RECORDS_REST_SORT_OPTIONS['vendors']['name'] = dict(
fields=['vendor_name'], title='Vendor name',
default_order='asc'
)
RECORDS_REST_DEFAULT_SORT['vendors'] = dict(
query='bestmatch', noquery='name')

RECORDS_REST_DEFAULT_SORT['collections'] = dict(
query='bestmatch', noquery='start_date')

# Detailed View Configuration
# ===========================
Expand Down Expand Up @@ -2806,3 +2857,23 @@ def _(x):

# OAuth base template
OAUTH2SERVER_COVER_TEMPLATE = 'rero_ils/oauth/base.html'

# STOP WORDS
# ==========
# Switch for the stop-word normalization.
RERO_ILS_STOP_WORDS_ACTIVATE = True

# Punctuation entries used by the stop-word normalization.
# NOTE(review): several entries are backslash-escaped, presumably because
# they are used inside a regular expression -- confirm against the
# normalizer implementation.
RERO_ILS_STOP_WORDS_PUNCTUATION = [
    '"',
    ';',
    ':',
    r'\.',
    '-',
    '_',
    r'\?',
    r'\!',
    r'\*',
    r'\+',
    '\n',
]

# Stop words, keyed by language code.
# A "default" entry can be added to provide a default configuration.
RERO_ILS_STOP_WORDS = {
    'eng': ['a', 'an', 'the'],
    'fre': ['de', 'des', 'du', "l'", 'la', 'le', 'les', 'un', 'une'],
    'ger': ['das', 'dem', 'den', 'der', 'des', 'die'],
    'ita': [
        'gli', 'i', 'il', "l'", 'la', 'le', 'li', 'lo', 'un', "un'",
        'una', 'uno',
    ],
    'spa': ['el', 'la', 'las', 'lo', 'los', 'un', 'una', 'unas', 'unos'],
}
51 changes: 51 additions & 0 deletions rero_ils/es_templates/v7/record.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
"max_result_window": "100000",
"analysis": {
"filter": {
"edge_ngram_filter": {
"type": "edge_ngram",
"min_gram": 3,
"max_gram": 10
},
"french_elision": {
"type": "elision",
"articles_case": true,
Expand All @@ -25,6 +30,32 @@
"puisqu"
]
},
"italian_elision": {
"type": "elision",
"articles": [
"c", "l", "all", "dall", "dell",
"nell", "sull", "coll", "pell",
"gl", "agl", "dagl", "degl", "negl",
"sugl", "un", "m", "t", "s", "v", "d"
],
"articles_case": true
},
"english_stop": {
"type": "stop",
"stopwords": "_english_"
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"german_stop": {
"type": "stop",
"stopwords": "_german_"
},
"italian_stop": {
"type": "stop",
"stopwords": "_italian_"
},
"french_stemmer": {
"type": "stemmer",
"language": "light_french"
Expand Down Expand Up @@ -65,6 +96,26 @@
"icu_folding",
"german_normalization"
]
},
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"icu_normalizer",
"icu_folding",
"french_elision",
"italian_elision",
"edge_ngram_filter"
]
}
},
"normalizer": {
"sort_normalizer": {
"type": "custom",
"filter": [
"lowercase"
]
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@
"analyzer": "autocomplete",
"search_analyzer": "standard"
},
"gnd_authorized_access_point_sort": {
"type": "keyword",
"normalizer": "sort_normalizer"
},
"idref_authorized_access_point_sort": {
"type": "keyword",
"normalizer": "sort_normalizer"
},
"rero_authorized_access_point_sort": {
"type": "keyword",
"normalizer": "sort_normalizer"
},
"$schema": {
"type": "keyword"
},
Expand Down Expand Up @@ -100,7 +112,10 @@
},
"authorized_access_point": {
"type": "text",
"copy_to": "autocomplete_name"
"copy_to": [
"autocomplete_name",
"gnd_authorized_access_point_sort"
]
},
"qualifier": {
"type": "keyword"
Expand Down Expand Up @@ -203,7 +218,10 @@
},
"authorized_access_point": {
"type": "text",
"copy_to": "autocomplete_name"
"copy_to": [
"autocomplete_name",
"idref_authorized_access_point_sort"
]
},
"qualifier": {
"type": "keyword"
Expand Down Expand Up @@ -306,7 +324,10 @@
},
"authorized_access_point": {
"type": "text",
"copy_to": "autocomplete_name"
"copy_to": [
"autocomplete_name",
"rero_authorized_access_point_sort"
]
},
"qualifier": {
"type": "keyword"
Expand Down
26 changes: 24 additions & 2 deletions rero_ils/modules/documents/listener.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

"""Signals connector for Document."""

from flask.globals import current_app
from isbnlib import is_isbn10, is_isbn13, to_isbn10, to_isbn13

from .utils import create_contributions, title_format_text_head
Expand Down Expand Up @@ -139,17 +140,24 @@ def enrich_document_data(sender, json=None, record=None, index=None,
)
json['title'].append(title)

json['sort_title'] = title_format_text_head(
# sort title
sort_title = title_format_text_head(
json.get('title', []),
with_subtitle=True
)
language = json.get('language')[0].get('value')
if current_app.config.get('RERO_ILS_STOP_WORDS_ACTIVATE', False):
sort_title = current_app.\
extensions['reroils-normalizer-stop-words'].\
normalize(sort_title, language)
json['sort_title'] = sort_title
# Local fields in JSON
local_fields = LocalField.get_local_fields_by_resource(
'doc', document_pid)
if local_fields:
json['local_fields'] = local_fields
# index both ISBN 10 and 13 format

# index both ISBN 10 and 13 format
def filter_isbn(identified_by):
"""Filter identified_by for type bf:Isbn."""
return identified_by.get('type') == 'bf:Isbn'
Expand All @@ -168,3 +176,17 @@ def filter_isbn(identified_by):
isbns.add(to_isbn10(isbn))
if isbns:
json['isbn'] = list(isbns)

# Populate startDate and endDate for use in sorting
pub_provisions = [
p for p in record.get('provisionActivity', [])
if p['type'] == 'bf:Publication'
]
pub_provision = next(iter(pub_provisions), None)
if pub_provision:
if 'startDate' in pub_provision:
json['provision_activity_start_date'] = \
pub_provision['startDate']
if 'endDate' in pub_provision:
json['provision_activity_end_date'] = \
pub_provision['endDate']
Original file line number Diff line number Diff line change
@@ -1,25 +1,4 @@
{
"settings": {
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 20
}
},
"analyzer": {
"autocomplete": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"autocomplete_filter"
]
}
}
}
},
"mappings": {
"date_detection": false,
"numeric_detection": false,
Expand Down Expand Up @@ -154,11 +133,11 @@
},
"autocomplete_title": {
"type": "text",
"analyzer": "autocomplete",
"search_analyzer": "standard"
"analyzer": "autocomplete"
},
"sort_title": {
"type": "keyword"
"type": "keyword",
"normalizer": "sort_normalizer"
},
"responsibilityStatement": {
"type": "object",
Expand Down Expand Up @@ -368,6 +347,12 @@
}
}
},
"provision_activity_start_date": {
"type": "integer"
},
"provision_activity_end_date": {
"type": "integer"
},
"provisionActivity": {
"type": "object",
"properties": {
Expand Down
Loading

0 comments on commit d7f0e08

Please sign in to comment.