Add validators and presets for tag_string normalization (normalize_tag_string_autocomplete)
mjanez · Nov 12, 2024 · e606499 · e606499
1 parent eba6c0d
commit e606499
Show file tree

Hide file tree

Showing 4 changed files with 105 additions and 2 deletions.
diff --git a/ckanext/schemingdcat/config/__init__.py b/ckanext/schemingdcat/config/__init__.py
@@ -54,6 +54,7 @@
     'slugify_pat',
     'URL_REGEX',
     'INVALID_CHARS',
+    'TAGS_NORMALIZE_PATTERN',
     'ACCENT_MAP',
     'COMMON_DATE_FORMATS'
 ]
diff --git a/ckanext/schemingdcat/config/tools.py b/ckanext/schemingdcat/config/tools.py
@@ -26,6 +26,7 @@
 
 # Compile the regular expression
 INVALID_CHARS = re.compile(r"[^a-zñ0-9_.-]")
+TAGS_NORMALIZE_PATTERN = re.compile(r'[^a-záéíóúüñ0-9\-_\.]')
 
 # Define a dictionary to map accented characters to their unaccented equivalents except ñ
 ACCENT_MAP = str.maketrans({

diff --git a/ckanext/schemingdcat/schemas/default_presets.json b/ckanext/schemingdcat/schemas/default_presets.json
@@ -39,6 +39,19 @@
         }
       }
     },
+    {
+      "preset_name": "normalize_tag_string_autocomplete",
+      "values": {
+        "validators": "ignore_missing normalize_tag_strings tag_string_convert",
+        "classes": ["control-full"],
+        "form_attrs": {
+          "data-module": "autocomplete",
+          "data-module-tags": "",
+          "data-module-source": "/api/2/util/tag/autocomplete?incomplete=?",
+          "class": ""
+        }
+      }
+    },
     {
       "preset_name": "tag_string_uris",
       "values": {
@@ -307,6 +320,24 @@
         "output_validators": "scheming_load_json"
       }
     },
+    {
+      "preset_name": "required_multiple_text_raws_ordered",
+      "values": {
+        "form_snippet": "schemingdcat/form_snippets/multiple_text.html",
+        "display_snippet": "schemingdcat/display_snippets/list_raws_ordered.html",
+        "validators": "not_empty scheming_required schemingdcat_multiple_text",
+        "output_validators": "scheming_load_json"
+      }
+    },
+    {
+      "preset_name": "required_multiple_text_links",
+      "values": {
+        "form_snippet": "schemingdcat/form_snippets/multiple_text.html",
+        "display_snippet": "schemingdcat/display_snippets/list_links.html",
+        "validators": "not_empty scheming_required schemingdcat_multiple_text",
+        "output_validators": "scheming_load_json"
+      }
+    },
     {
       "preset_name": "markdown",
       "values": {

diff --git a/ckanext/schemingdcat/validators.py b/ckanext/schemingdcat/validators.py
@@ -3,6 +3,7 @@
 import six
 import mimetypes
 from shapely.geometry import shape, Polygon
+from functools import lru_cache
 
 import ckanext.scheming.helpers as sh
 import ckanext.schemingdcat.helpers as helpers
@@ -35,7 +36,8 @@
 from ckanext.schemingdcat.config import (
     OGC2CKAN_HARVESTER_MD_CONFIG,
     mimetype_base_uri,
-    DCAT_AP_HVD_CATEGORY_LEGISLATION
+    DCAT_AP_HVD_CATEGORY_LEGISLATION,
+    TAGS_NORMALIZE_PATTERN
 )
 
 log = logging.getLogger(__name__)
@@ -1158,4 +1160,72 @@ def validator(key, data, errors, context):
                 if data.get(key) != DCAT_AP_HVD_CATEGORY_LEGISLATION:
                     data[key] = [DCAT_AP_HVD_CATEGORY_LEGISLATION]
 
-    return validator
+    return validator
+
+@scheming_validator
+@validator
+def normalize_tag_strings(field, schema):
+    """
+    Normalizes the value of a specified tag_string and tags before tag_string_convert validator using the rules determined by normalize_string
+
+    Args:
+        field (dict): Information about the field to update.
+        schema (dict): The schema for the field to update.
+
+    Returns:
+        function: A validation function to normalize the value of the key.
+    """
+    log.debug('miteco_normalize_tag_string: %s', field)
+
+    def validator(key, data, errors, context):
+        value = data.get(key)
+
+        try:
+            if value:
+                normalized_values = []
+                if isinstance(value, str):
+                    tags = value.split(',')
+                    normalized_values = [normalize_string(tag.strip()) for tag in tags]
+                    data[key] = ','.join(normalized_values)
+                elif isinstance(value, list):
+                    for tag in value:
+                        if 'name' in tag:
+                            tag['name'] = normalize_string(tag['name'].strip())
+                        if 'display_name' in tag:
+                            tag['display_name'] = normalize_string(tag['display_name'].strip())
+
+            # Normalize the tags in data
+            for data_key in data.keys():
+                if isinstance(data_key, tuple) and data_key[0] == 'tags' and data_key[2] == 'name':
+                    data[data_key] = normalize_string(data[data_key].strip())
+
+        except Exception as e:
+            log.error(f"Error normalizing tags: {e}")
+
+    return validator
+
+@staticmethod
+@lru_cache(maxsize=44)
+def normalize_string(s):
+    """Normalizes a string according to the rules:
+        - Replaces spaces with hyphens.
+        - Converts to lowercase.
+        - Removes disallowed characters.
+        - Normalize to using only alphanumeric and spanish accents (áéíóúüñ) or hyphens "-", underscores "_" and dots "."
+        - Limits the length to 30 characters.
+
+    Args:
+        s (str): String to normalize.
+
+    Returns:
+        str: Normalized string.
+
+    Raises:
+        Invalid: If the string contains disallowed characters.
+    """    
+    s = s.strip()
+    s = s.lower()
+    s = s.replace(' ', '-')
+    s = TAGS_NORMALIZE_PATTERN.sub('', s)
+    s = s[:30]
+    return s