Skip to content

Commit

Permalink
Merge pull request #26 from mjanez/main
Browse files Browse the repository at this point in the history
Update latest branch
  • Loading branch information
mjanez authored Feb 14, 2024
2 parents eed6020 + dcb0bdc commit 829801f
Show file tree
Hide file tree
Showing 9 changed files with 134 additions and 69 deletions.
6 changes: 6 additions & 0 deletions ogc2ckan/ckan_datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def __init__(self, ckan_id, name, owner_org, license_id):
self.owner_org = owner_org
self.private = False
self.groups = []
self.graphic_overview = None
# use http://<ckan_url>/api/action/organization_list to see the organization ids in your CKAN site
self.license_id = license_id
self.identifier = None
Expand Down Expand Up @@ -142,6 +143,9 @@ def set_private(self, private):
def set_groups(self, groups):
    """Store the groups assigned to this dataset.

    Args:
        groups: Value kept on ``self.groups``; the constructor initialises
            this attribute to an empty list, and it is emitted under the
            ``'groups'`` key of the dataset dictionary.
    """
    self.groups = groups

def set_graphic_overview(self, graphic_overview):
    """Store the graphic overview of this dataset.

    Args:
        graphic_overview: Value kept on ``self.graphic_overview``; the
            constructor initialises this attribute to ``None``, and it is
            emitted under the ``'graphic_overview'`` key of the dataset
            dictionary. Presumably a thumbnail/preview image URL -- confirm
            against the CKAN schema in use.
    """
    self.graphic_overview = graphic_overview

def set_publisher_uri(self, publisher_uri):
    """Store the publisher URI of this dataset.

    Args:
        publisher_uri: Value kept on ``self.publisher_uri``.
    """
    self.publisher_uri = publisher_uri

Expand Down Expand Up @@ -351,6 +355,7 @@ def dataset_dict(self):
'owner_org': self.owner_org,
'private': self.private,
'groups': self.groups,
'graphic_overview': self.graphic_overview,
'title': self.title,
'notes': self.notes,
'license_id': self.license_id,
Expand Down Expand Up @@ -448,6 +453,7 @@ def dataset_dict(self):
'owner_org': self.owner_org,
'private': self.private,
'groups': self.groups,
'graphic_overview': self.graphic_overview,
'title_translated': self.title_translated,
'notes_translated': self.notes_translated,
'license_id': self.license_id,
Expand Down
6 changes: 6 additions & 0 deletions ogc2ckan/ckan_datasets/geodcatap.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(self, ckan_id, name, owner_org, license_id):
self.owner_org = owner_org
self.private = False
self.groups = []
self.graphic_overview = None
# use http://<ckan_url>/api/action/organization_list to see the organization ids in your CKAN site
self.license_id = license_id
self.identifier = ckan_id
Expand Down Expand Up @@ -181,6 +182,9 @@ def set_private(self, private):
def set_groups(self, groups):
    """Store the groups assigned to this dataset.

    Args:
        groups: Value kept on ``self.groups``; the constructor initialises
            this attribute to an empty list, and it is emitted under the
            ``'groups'`` key of the dataset dictionaries.
    """
    self.groups = groups

def set_graphic_overview(self, graphic_overview):
    """Store the graphic overview of this dataset.

    Args:
        graphic_overview: Value kept on ``self.graphic_overview``; the
            constructor initialises this attribute to ``None``, and it is
            emitted under the ``'graphic_overview'`` key of the dataset
            dictionaries. Presumably a thumbnail/preview image URL --
            confirm against the CKAN schema in use.
    """
    self.graphic_overview = graphic_overview

def set_publisher_uri(self, publisher_uri):
    """Store the publisher URI of this dataset.

    Args:
        publisher_uri: Value kept on ``self.publisher_uri``.
    """
    self.publisher_uri = publisher_uri

Expand Down Expand Up @@ -402,6 +406,7 @@ def dataset_dict(self):
'owner_org': self.owner_org,
'private': self.private,
'groups': self.groups,
'graphic_overview': self.graphic_overview,
'title': self.title,
'notes': self.notes,
'license_id': self.license_id,
Expand Down Expand Up @@ -505,6 +510,7 @@ def dataset_dict_multilang(self):
'owner_org': self.owner_org,
'private': self.private,
'groups': self.groups,
'graphic_overview': self.graphic_overview,
'title_translated': self.title_translated,
'notes_translated': self.notes_translated,
'license_id': self.license_id,
Expand Down
18 changes: 16 additions & 2 deletions ogc2ckan/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,20 @@ def _get_ckan_name(name, organization):

return ckan_name

@staticmethod
def _normalize_id(id):
# the id of a CKAN dataset, must be between 2 and 36 characters long and contain only lowercase
# alphanumeric characters, - and _, e.g. 'warandpeace'
normal = unicodedata.normalize('NFKD', id).encode('ASCII', 'ignore').decode('utf-8')
ckan_name = re.sub(r'[^a-z0-9_-]', '_', normal.lower())[:36]
if len(ckan_name) == 0:
ckan_name = 'unnamed'
elif ckan_name[0].isdigit():
ckan_name = 'n' + ckan_name[1:]
ckan_name = ckan_name.lower()

return ckan_name

@staticmethod
def _normalize_date(date):
if isinstance(date, str):
Expand Down Expand Up @@ -639,7 +653,7 @@ def _get_ckan_format(dist_info):
informat = ''.join(str(value) for value in dist_info.values()).lower()
informat = next((key for key in OGC2CKAN_MD_FORMATS if key.lower() in informat), dist_info.get('url', '').lower())
except:
informat = dist_info['url'].lower()
informat = dist_info['url'].lower() if isinstance(dist_info['url'], str) else dist_info['url']

return OGC2CKAN_MD_FORMATS.get(informat, (None, None, None, None))

Expand Down Expand Up @@ -928,7 +942,7 @@ def set_translated_fields(self, dataset, source_data: object, source_language=No
source_language = source_language if "http" in source_language else None
default_language = source_language if source_language is not None and source_language != self.default_language else self.default_language

required_lang = get_mapping_value(default_language, 'language', 'iso_639_2')
required_lang = get_mapping_value(default_language, 'language', 'iso_639_1')

for field, field_translated in OGC2CKAN_MD_MULTILANG_FIELDS.items():
output = {}
Expand Down
2 changes: 1 addition & 1 deletion ogc2ckan/harvesters/csw.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str):
dcat_type = OGC2CKAN_HARVESTER_MD_CONFIG['dcat_type']
dataset.set_resource_type(
dcat_type['series'] if layer_info.hierarchy == "series"
else dcat_type['service'] if layer_info.hierarchy == "service"
else dcat_type['spatial_data_service'] if layer_info.hierarchy == "service"
else dcat_type['dataset'])

# Set SpatialRepresentationType
Expand Down
60 changes: 43 additions & 17 deletions ogc2ckan/harvesters/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ def get_file_by_extension(self, harvester_formats):

logging.info(f"{log_module}:Load '{self.file_extension.upper()}' file: '{filename}' with {len(table_data)} records")

# Clean column names by removing leading/trailing whitespaces, newlines, and tabs
table_data.columns = table_data.columns.str.strip().str.replace('\n', '').str.replace('\t', '')

# Remove all fields that are a nan float and trim all spaces of the values
table_data = table_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
table_data = table_data.fillna(value='')
Expand All @@ -90,18 +93,32 @@ def get_file_by_extension(self, harvester_formats):
table_distributions = table_distributions.rename(columns=lambda x: x.replace('resource_', ''))
table_datadictionaries = table_datadictionaries.rename(columns=lambda x: re.sub(re.compile(r'datadictionary(_info)?_'), '', x).replace('info.', ''))

# Group distributions by dataset_id and convert to list of dicts
table_distributions_grouped = table_distributions.groupby('dataset_id').apply(lambda x: x.to_dict('records')).to_dict()

# Group datadictionaries by resource_id and convert to list of dicts
table_datadictionaries_grouped = table_datadictionaries.groupby('resource_id').apply(lambda x: x.to_dict('records')).to_dict()

# Remove rows where 'dataset_id' is None or an empty string
table_distributions = table_distributions[table_distributions['dataset_id'].notna() & (table_distributions['dataset_id'] != '')]

if not table_distributions.empty:
# Group distributions by dataset_id and convert to list of dicts
table_distributions_grouped = table_distributions.groupby('dataset_id' ).apply(lambda x: x.to_dict('records')).to_dict()
else:
logging.info(f"{log_module}:No distributions loaded. Check 'distribution.dataset_id' fields")
table_distributions_grouped = None

# Filter datadictionaries where resource_id is not empty or None
if 'resource_id' in table_datadictionaries.columns:
table_datadictionaries = table_datadictionaries[table_datadictionaries['resource_id'].notna() & (table_datadictionaries['resource_id'] != '')]

# Group datadictionaries by resource_id and convert to list of dicts
table_datadictionaries_grouped = table_datadictionaries.groupby('resource_id').apply(lambda x: x.to_dict('records')).to_dict()
else:
logging.info(f"{log_module}:No datadictionaries loaded. Check 'datadictionary.resource_id' fields.")
table_datadictionaries_grouped = None

# Add distributions and datadictionaries to each dataset object
table_data = [
{
**d,
'distributions': [
{**dr, 'datadictionaries': table_datadictionaries_grouped.get(dr['id'], [])}
{**dr, 'datadictionaries': table_datadictionaries_grouped.get(dr['id'], []) if table_datadictionaries_grouped else []}
for dr in table_distributions_grouped.get(
d.get('identifier') or d.get('alternate_identifier') or d.get('inspire_id'), []
)
Expand Down Expand Up @@ -194,11 +211,9 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
dataset.set_modified(modified_date)

# DCAT Type (dataset/series)
dcat_type = OGC2CKAN_HARVESTER_MD_CONFIG['dcat_type']
is_series = False
if table_dataset.dcat_type and 'http' in table_dataset.dcat_type and 'series' in table_dataset.dcat_type:
is_series = True
dataset.set_resource_type(dcat_type['series' if is_series else 'default'])
dcat_type = getattr(table_dataset, 'dcat_type', OGC2CKAN_HARVESTER_MD_CONFIG['representation_type']['default'])
dcat_type = dcat_type.replace('https:', 'http:') if dcat_type else None
dataset.set_resource_type(dcat_type)

# Set SpatialRepresentationType
representation_type = getattr(table_dataset, 'representation_type', OGC2CKAN_HARVESTER_MD_CONFIG['representation_type']['default']).replace('https:', 'http:')
Expand Down Expand Up @@ -298,6 +313,14 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
metadata_profile = getattr(table_dataset, 'metadata_profile', OGC2CKAN_HARVESTER_MD_CONFIG['metadata_profile'])
dataset.set_metadata_profile(metadata_profile)

# Set graphic overview
graphic_overview = getattr(table_dataset, 'graphic_overview', None)
dataset.set_graphic_overview(graphic_overview)

# Set purpose
purpose = getattr(table_dataset, 'purpose', None)
dataset.set_purpose(purpose)

# Set Responsible Parties (Point of contact, Resource publisher and Resource contact/maintainer)
self.set_default_responsible_parties(dataset, self.default_dcat_info, ckan_info, table_dataset)

Expand All @@ -315,7 +338,8 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
dataset.set_license(ckan_info.default_license)

# Set distributions
self.get_distribution(ckan_info, dataset, distribution, datadictionary, datadictionaryfield, record, table_dataset)
if table_dataset.distributions:
self.get_distribution(ckan_info, dataset, distribution, datadictionary, datadictionaryfield, record, table_dataset)

# Metadata distributions (INSPIRE & GeoDCAT-AP)
self.set_metadata_distributions(ckan_info, dataset, distribution, record)
Expand All @@ -324,8 +348,10 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
keywords = []
keywords_uri = []
if hasattr(table_dataset, 'tag_string'):
for k in table_dataset.tag_string:
keyword_name = k.lower()
tag_string = table_dataset.tag_string
tag_string = [tag_string] if isinstance(tag_string, str) else tag_string
for k in tag_string:
keyword_name = k.lower()
if 'http' in keyword_name or '/' in keyword_name:
keyword_name = keyword_name.split('/')[-1]
keywords_uri.add(keyword_name)
Expand Down Expand Up @@ -353,7 +379,7 @@ def get_distribution(self, ckan_info: CKANInfo, dataset, distribution, datadicti
datadictionaries = []

for i, r in enumerate(table_dataset.distributions):
distribution_id = r.get('id', str(uuid.uuid4()))
distribution_id = self._normalize_id(r.get('id', str(uuid.uuid4())))
# Get data dictionaries
if r.datadictionaries:
self.get_datadictionary(datadictionary, datadictionaryfield, r.datadictionaries, distribution_id)
Expand Down Expand Up @@ -435,7 +461,7 @@ def _update_custom_formats(format, url=None, **args):
Returns:
str: The updated custom format.
"""
if any(string in format.lower() for string in ['esri', 'arcgis']) or 'viewer.html?url=' in url:
if isinstance(format, str) and (any(string in format.lower() for string in ['esri', 'arcgis']) or 'viewer.html?url=' in url):
format = 'HTML'

return format
Expand Down
4 changes: 3 additions & 1 deletion ogc2ckan/harvesters/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ def get_metadata_records(self):
try:
metadata = MD_Metadata(etree.parse(md_record))
identifier = metadata.identifier
#TODO: Multilang also for CSW and OGC harvesters
#metadata.locales = ['es', 'en']
if identifier:
md_records[identifier] = metadata
except XmlError as e:
Expand Down Expand Up @@ -144,7 +146,7 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str):
dcat_type = OGC2CKAN_HARVESTER_MD_CONFIG['dcat_type']
dataset.set_resource_type(
dcat_type['series'] if layer_info.hierarchy == "series"
else dcat_type['service'] if layer_info.hierarchy == "service"
else dcat_type['spatial_data_service'] if layer_info.hierarchy == "service"
else dcat_type['dataset'])

# Set SpatialRepresentationType
Expand Down
13 changes: 12 additions & 1 deletion ogc2ckan/mappings/default_ogc2ckan_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,19 @@
'dcat_type': {
'series': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/series',
'dataset': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset',
'service': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/service',
'spatial_data_service': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/service',
'default': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset',
'collection': 'http://purl.org/dc/dcmitype/Collection',
'event': 'http://purl.org/dc/dcmitype/Event',
'image': 'http://purl.org/dc/dcmitype/Image',
'still_image': 'http://purl.org/dc/dcmitype/StillImage',
'moving_image': 'http://purl.org/dc/dcmitype/MovingImage',
'physical_object': 'http://purl.org/dc/dcmitype/PhysicalObject',
'interactive_resource': 'http://purl.org/dc/dcmitype/InteractiveResource',
'service': 'http://purl.org/dc/dcmitype/Service',
'sound': 'http://purl.org/dc/dcmitype/Sound',
'software': 'http://purl.org/dc/dcmitype/Software',
'text': 'http://purl.org/dc/dcmitype/Text',
},
'encoding': 'UTF-8',
'frequency' : 'http://publications.europa.eu/resource/authority/frequency/UNKNOWN',
Expand Down
Loading

0 comments on commit 829801f

Please sign in to comment.