Skip to content

Commit

Permalink
Add validators and presets for tag_string normalization (normalize_ta…
Browse files Browse the repository at this point in the history
…g_string_autocomplete)
  • Loading branch information
mjanez committed Nov 12, 2024
1 parent eba6c0d commit e606499
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 2 deletions.
1 change: 1 addition & 0 deletions ckanext/schemingdcat/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
'slugify_pat',
'URL_REGEX',
'INVALID_CHARS',
'TAGS_NORMALIZE_PATTERN',
'ACCENT_MAP',
'COMMON_DATE_FORMATS'
]
1 change: 1 addition & 0 deletions ckanext/schemingdcat/config/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

# Pre-compiled regular expressions, built once at import time.
# INVALID_CHARS matches any single character that is NOT a lowercase ASCII
# letter, "ñ", a digit, underscore, dot or hyphen.
INVALID_CHARS = re.compile(r"[^a-zñ0-9_.-]")
# TAGS_NORMALIZE_PATTERN matches any single character that is NOT a lowercase
# ASCII letter, one of "áéíóúüñ", a digit, hyphen, underscore or dot.
# Uppercase letters also match, so callers presumably lowercase first.
TAGS_NORMALIZE_PATTERN = re.compile(r'[^a-záéíóúüñ0-9\-_\.]')

# Define a dictionary to map accented characters to their unaccented equivalents except ñ
ACCENT_MAP = str.maketrans({
Expand Down
31 changes: 31 additions & 0 deletions ckanext/schemingdcat/schemas/default_presets.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@
}
}
},
{
"preset_name": "normalize_tag_string_autocomplete",
"values": {
"validators": "ignore_missing normalize_tag_strings tag_string_convert",
"classes": ["control-full"],
"form_attrs": {
"data-module": "autocomplete",
"data-module-tags": "",
"data-module-source": "/api/2/util/tag/autocomplete?incomplete=?",
"class": ""
}
}
},
{
"preset_name": "tag_string_uris",
"values": {
Expand Down Expand Up @@ -307,6 +320,24 @@
"output_validators": "scheming_load_json"
}
},
{
"preset_name": "required_multiple_text_raws_ordered",
"values": {
"form_snippet": "schemingdcat/form_snippets/multiple_text.html",
"display_snippet": "schemingdcat/display_snippets/list_raws_ordered.html",
"validators": "not_empty scheming_required schemingdcat_multiple_text",
"output_validators": "scheming_load_json"
}
},
{
"preset_name": "required_multiple_text_links",
"values": {
"form_snippet": "schemingdcat/form_snippets/multiple_text.html",
"display_snippet": "schemingdcat/display_snippets/list_links.html",
"validators": "not_empty scheming_required schemingdcat_multiple_text",
"output_validators": "scheming_load_json"
}
},
{
"preset_name": "markdown",
"values": {
Expand Down
74 changes: 72 additions & 2 deletions ckanext/schemingdcat/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import six
import mimetypes
from shapely.geometry import shape, Polygon
from functools import lru_cache

import ckanext.scheming.helpers as sh
import ckanext.schemingdcat.helpers as helpers
Expand Down Expand Up @@ -35,7 +36,8 @@
from ckanext.schemingdcat.config import (
OGC2CKAN_HARVESTER_MD_CONFIG,
mimetype_base_uri,
DCAT_AP_HVD_CATEGORY_LEGISLATION
DCAT_AP_HVD_CATEGORY_LEGISLATION,
TAGS_NORMALIZE_PATTERN
)

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -1158,4 +1160,72 @@ def validator(key, data, errors, context):
if data.get(key) != DCAT_AP_HVD_CATEGORY_LEGISLATION:
data[key] = [DCAT_AP_HVD_CATEGORY_LEGISLATION]

return validator
return validator

@scheming_validator
@validator
def normalize_tag_strings(field, schema):
    """
    Build a validator that normalizes tag names with ``normalize_string``.

    Intended to run before the ``tag_string_convert`` validator so that the
    tags it creates are already normalized.

    Args:
        field (dict): Information about the field to update.
        schema (dict): The schema for the field to update.

    Returns:
        function: A validation function ``(key, data, errors, context)`` that
        rewrites the tag values in ``data`` in place and returns ``None``.
    """
    log.debug('normalize_tag_strings: %s', field)

    def validator(key, data, errors, context):
        value = data.get(key)

        # Normalization is best-effort: an unexpected failure is logged (with
        # traceback) and never blocks validation of the dataset.
        try:
            if value and isinstance(value, str):
                # Comma-separated tag string, e.g. "Water Quality, Ríos".
                normalized = (
                    normalize_string(tag.strip()) for tag in value.split(',')
                )
                # Drop tags that become empty once disallowed characters are
                # removed, so the result never contains ",," sequences.
                data[key] = ','.join(tag for tag in normalized if tag)
            elif value and isinstance(value, list):
                # List of tag dicts; anything that is not a dict is skipped
                # instead of aborting the whole normalization.
                for tag in value:
                    if not isinstance(tag, dict):
                        continue
                    for attr in ('name', 'display_name'):
                        if isinstance(tag.get(attr), str):
                            tag[attr] = normalize_string(tag[attr].strip())

            # Normalize tags already present in the flattened data dict under
            # keys shaped like ('tags', <index>, 'name'). Shorter tuples and
            # non-string values are skipped rather than raising.
            for data_key in list(data):
                if (
                    isinstance(data_key, tuple)
                    and len(data_key) > 2
                    and data_key[0] == 'tags'
                    and data_key[2] == 'name'
                    and isinstance(data[data_key], str)
                ):
                    data[data_key] = normalize_string(data[data_key].strip())

        except Exception as e:
            log.exception("Error normalizing tags: %s", e)

    return validator

# NOTE: this is a module-level function, so it must not be wrapped in
# ``staticmethod``: staticmethod objects are only callable since Python 3.10,
# and on older interpreters every direct call would raise TypeError.
@lru_cache(maxsize=44)
def normalize_string(s):
    """Normalize a tag string.

    The steps are applied in this order:
    - Strips leading and trailing whitespace.
    - Converts to lowercase.
    - Replaces spaces with hyphens.
    - Removes every character matched by ``TAGS_NORMALIZE_PATTERN``, leaving
      only lowercase alphanumerics, Spanish accented letters (áéíóúüñ),
      hyphens "-", underscores "_" and dots ".".
    - Limits the length to 30 characters.

    Disallowed characters are removed silently; no exception is raised for
    them. Results are memoized with ``lru_cache``, so ``s`` must be hashable.

    Args:
        s (str): String to normalize.

    Returns:
        str: Normalized string (may be empty).
    """
    lowered = s.strip().lower()
    hyphenated = lowered.replace(' ', '-')
    cleaned = TAGS_NORMALIZE_PATTERN.sub('', hyphenated)
    # Truncate last so removed characters do not count toward the limit.
    return cleaned[:30]

0 comments on commit e606499

Please sign in to comment.