Skip to content

Commit

Permalink
Fixed schemingdcat and add custom_formats
Browse files Browse the repository at this point in the history
  • Loading branch information
mjanez committed Mar 21, 2024
1 parent dcb0bdc commit b6a3a07
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ pdm run python ogc2ckan/ogc2ckan.py
## Additional info
### CKAN Schemas
The CKAN output schemas are located in the [`ogc2ckan/ckan_datasets`](./ogc2ckan/ckan_datasets) folder. The schemas are used to map the metadata fields from the different sources to the CKAN dataset fields. The following schemas are currently available:
* `geodcatap`: Schema based in [GeoDCAT-AP Schema for CKAN](https://github.com/mjanez/ckanext-scheming_dcat).
* `geodcatap`: Schema based on the [GeoDCAT-AP Schema for CKAN](https://github.com/mjanez/ckanext-schemingdcat).
* `base`: A DCAT schema with the basic fields.

You can create your own Schema.
Expand Down
2 changes: 1 addition & 1 deletion ogc2ckan/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1088,7 +1088,7 @@ def _clean_name(name):

# Replace accented and special characters with their unaccented equivalents or _
name = ''.join(accent_map.get(c, c) for c in name)
name = re.sub(r'[^a-zñ0-9_.-]', '_', name.lower().strip())
name = re.sub(r'[^a-zñ0-9_.-]', '-', name.lower().strip())

# Truncate the name to 40 characters
name = name[:40]
Expand Down
24 changes: 15 additions & 9 deletions ogc2ckan/harvesters/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

# custom functions
from config.ogc2ckan_config import get_log_module
from mappings.default_ogc2ckan_config import OGC2CKAN_HARVESTER_CONFIG, OGC2CKAN_HARVESTER_MD_CONFIG, OGC2CKAN_CKANINFO_CONFIG
from mappings.default_ogc2ckan_config import OGC2CKAN_HARVESTER_CONFIG, OGC2CKAN_HARVESTER_MD_CONFIG, OGC2CKAN_CKANINFO_CONFIG, CUSTOM_FORMAT_RULES
from controller.mapping import get_df_mapping_json

log_module = get_log_module(os.path.abspath(__file__))
Expand Down Expand Up @@ -409,7 +409,7 @@ def get_distribution(self, ckan_info: CKANInfo, dataset, distribution, datadicti
'id': distribution_id,
'url': r.get('url', ''),
'name': dist_name,
'format': self._update_custom_formats(format_type, r.get('url', '')),
'format': self._update_custom_format(format_type, r.get('url', '')),
'media_type': media_type,
'description': r.get('description', ''),
'license': r.get('license', ckan_info.default_license),
Expand Down Expand Up @@ -447,11 +447,12 @@ def get_datadictionary(self, datadictionary, datadictionaryfield, table_datadict


@staticmethod
def _update_custom_formats(format, url=None, **args):
"""Update the custom format.
def _update_custom_format(format, url=None, **args):
"""Update the custom format based on custom rules.
If the format contains 'esri' or 'arcgis' (case-insensitive) or the URL contains 'viewer.html?url=',
the format is updated to 'HTML'.
The function checks the format and URL against a set of custom rules (CUSTOM_FORMAT_RULES). If a rule matches,
the format is updated according to that rule. This function is designed to be easily
extendable with new rules.
Args:
format (str): The custom format to update.
Expand All @@ -461,9 +462,14 @@ def _update_custom_formats(format, url=None, **args):
Returns:
str: The updated custom format.
"""
if isinstance(format, str) and (any(string in format.lower() for string in ['esri', 'arcgis']) or 'viewer.html?url=' in url):
format = 'HTML'

if isinstance(format, str):
format_lower = format.lower()

for rule in CUSTOM_FORMAT_RULES:
if any(string in format_lower for string in rule['format_strings']) or rule['url_string'] in url:
format = rule['new_format']
break

return format

@staticmethod
Expand Down
11 changes: 10 additions & 1 deletion ogc2ckan/mappings/default_ogc2ckan_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,4 +196,13 @@
}

# loose definition of BCP47-like strings
BCP_47_LANGUAGE = u'^[a-z]{2,8}(-[0-9a-zA-Z]{1,8})*$'
BCP_47_LANGUAGE = u'^[a-z]{2,8}(-[0-9a-zA-Z]{1,8})*$'

# Rules used by `_update_custom_format` (ogc2ckan/harvesters/table.py) to rewrite a distribution's format.
# Rules are evaluated in list order and the first matching rule wins.
# Keys of each rule:
#   'format_strings': substrings looked up in the lowercased format value, so entries should be lowercase.
#   'url_string': substring looked up in the resource URL.
#   'new_format': value assigned to the format when either of the checks above matches.
CUSTOM_FORMAT_RULES = [
{
'format_strings': ['esri', 'arcgis'],
'url_string': 'viewer.html?url=',
'new_format': 'HTML'
},
# Add more rules here as needed
]
2 changes: 1 addition & 1 deletion ogc2ckan/ogc2ckan.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def start_harvesting(config_file):
logging.warning(f"{log_module}:[INSECURE] SSL_UNVERIFIED_MODE:'{ckan_info.ssl_unverified_mode}'. Only if you trust the host.")

if ckan_info.metadata_distributions == True or ckan_info.metadata_distributions == "True":
logging.warning(f"{log_module}:METADATA_DISTRIBUTIONS:'{ckan_info.metadata_distributions}'. It is not necessary if you do not intend to generate distributions for geographic metadata (INSPIRE ISO19139) or Linked Open Data (GeoDCAT-AP). ckanext-scheming_dcat already links the most important metadata profiles (https://github.com/mjanez/ckanext-scheming_dcat).")
logging.warning(f"{log_module}:METADATA_DISTRIBUTIONS:'{ckan_info.metadata_distributions}'. It is not necessary if you do not intend to generate distributions for geographic metadata (INSPIRE ISO19139) or Linked Open Data (GeoDCAT-AP). ckanext-schemingdcat already links the most important metadata profiles (https://github.com/mjanez/ckanext-schemingdcat).")

logging.info(f"{log_module}:Type of activated harvesters: {', '.join([f'{h.upper()}' for h in active_harvesters])}")
logging.info(f"{log_module}:CKAN_URL: {ckan_info.ckan_site_url}")
Expand Down

0 comments on commit b6a3a07

Please sign in to comment.