Merge pull request #26 from mjanez/main

Update latest branch
mjanez · Feb 14, 2024 · 829801f · 829801f
2 parents eed6020 + dcb0bdc
commit 829801f
Show file tree

Hide file tree

Showing 9 changed files with 134 additions and 69 deletions.
diff --git a/ogc2ckan/ckan_datasets/base.py b/ogc2ckan/ckan_datasets/base.py
@@ -107,6 +107,7 @@ def __init__(self, ckan_id, name, owner_org, license_id):
         self.owner_org = owner_org
         self.private = False
         self.groups = []
+        self.graphic_overview = None
         # use http://<ckan_url>/api/action/organization_list to see the organization ids in your CKAN site
         self.license_id = license_id
         self.identifier = None
@@ -142,6 +143,9 @@ def set_private(self, private):
     def set_groups(self, groups):
         self.groups = groups
 
+    def set_graphic_overview(self, graphic_overview):
+        """Store the dataset's graphic overview.
+
+        The value is kept as given (the attribute starts as ``None``) and is
+        exported under the ``'graphic_overview'`` key of the dataset dict.
+        NOTE(review): presumably a reference to a preview image - confirm.
+        """
+        self.graphic_overview = graphic_overview
+
     def set_publisher_uri(self, publisher_uri):
         self.publisher_uri = publisher_uri
 
@@ -351,6 +355,7 @@ def dataset_dict(self):
             'owner_org': self.owner_org,
             'private': self.private,
             'groups': self.groups,
+            'graphic_overview': self.graphic_overview,
             'title': self.title,
             'notes': self.notes,
             'license_id': self.license_id,
@@ -448,6 +453,7 @@ def dataset_dict(self):
             'owner_org': self.owner_org,
             'private': self.private,
             'groups': self.groups,
+            'graphic_overview': self.graphic_overview,
             'title_translated': self.title_translated,
             'notes_translated': self.notes_translated,
             'license_id': self.license_id,

diff --git a/ogc2ckan/ckan_datasets/geodcatap.py b/ogc2ckan/ckan_datasets/geodcatap.py
@@ -108,6 +108,7 @@ def __init__(self, ckan_id, name, owner_org, license_id):
         self.owner_org = owner_org
         self.private = False
         self.groups = []
+        self.graphic_overview = None
         # use http://<ckan_url>/api/action/organization_list to see the organization ids in your CKAN site
         self.license_id = license_id
         self.identifier = ckan_id
@@ -181,6 +182,9 @@ def set_private(self, private):
     def set_groups(self, groups):
         self.groups = groups
 
+    def set_graphic_overview(self, graphic_overview):
+        """Store the dataset's graphic overview.
+
+        The value is kept as given (the attribute starts as ``None``) and is
+        exported under the ``'graphic_overview'`` key of both the plain and
+        the multilingual dataset dicts.
+        NOTE(review): presumably a reference to a preview image - confirm.
+        """
+        self.graphic_overview = graphic_overview
+
     def set_publisher_uri(self, publisher_uri):
         self.publisher_uri = publisher_uri
 
@@ -402,6 +406,7 @@ def dataset_dict(self):
             'owner_org': self.owner_org,
             'private': self.private,
             'groups': self.groups,
+            'graphic_overview': self.graphic_overview,
             'title': self.title,
             'notes': self.notes,
             'license_id': self.license_id,
@@ -505,6 +510,7 @@ def dataset_dict_multilang(self):
             'owner_org': self.owner_org,
             'private': self.private,
             'groups': self.groups,
+            'graphic_overview': self.graphic_overview,
             'title_translated': self.title_translated,
             'notes_translated': self.notes_translated,
             'license_id': self.license_id,

diff --git a/ogc2ckan/harvesters/base.py b/ogc2ckan/harvesters/base.py
@@ -559,6 +559,20 @@ def _get_ckan_name(name, organization):
 
         return ckan_name
 
+    @staticmethod
+    def _normalize_id(id):
+        """Return a CKAN-safe identifier derived from ``id``.
+
+        The value is transliterated to ASCII, lower-cased, every character
+        outside ``[a-z0-9_-]`` is replaced by ``_`` and the result is cut to
+        36 characters. An empty result becomes ``'unnamed'``; a leading digit
+        is replaced (not prefixed) by ``'n'`` so the length limit still holds.
+
+        Args:
+            id: Source identifier. Non-string values (e.g. a numeric id) are
+                converted with ``str()``; ``None`` is treated as an empty id.
+
+        Returns:
+            str: The normalized identifier. Short ids are not padded.
+        """
+        # Harvested values are not guaranteed to be strings, and
+        # unicodedata.normalize() raises TypeError on anything else.
+        text = '' if id is None else str(id)
+        # NFKD splits accented characters into base + combining mark; the
+        # ASCII round-trip then drops the marks and any other non-ASCII char.
+        normal = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
+        ckan_name = re.sub(r'[^a-z0-9_-]', '_', normal.lower())[:36]
+        if not ckan_name:
+            return 'unnamed'
+        if ckan_name[0].isdigit():
+            ckan_name = 'n' + ckan_name[1:]
+
+        return ckan_name
+
     @staticmethod
     def _normalize_date(date):
         if isinstance(date, str):
@@ -639,7 +653,7 @@ def _get_ckan_format(dist_info):
                 informat = ''.join(str(value) for value in dist_info.values()).lower()
                 informat = next((key for key in OGC2CKAN_MD_FORMATS if key.lower() in informat), dist_info.get('url', '').lower())
             except:
-                informat = dist_info['url'].lower()
+                informat = dist_info['url'].lower() if isinstance(dist_info['url'], str) else dist_info['url']
 
         return OGC2CKAN_MD_FORMATS.get(informat, (None, None, None, None))
 
@@ -928,7 +942,7 @@ def set_translated_fields(self, dataset, source_data: object, source_language=No
         source_language = source_language if "http" in source_language else None
         default_language = source_language if source_language is not None and source_language != self.default_language else self.default_language
 
-        required_lang = get_mapping_value(default_language, 'language', 'iso_639_2')
+        required_lang = get_mapping_value(default_language, 'language', 'iso_639_1')
 
         for field, field_translated in OGC2CKAN_MD_MULTILANG_FIELDS.items():
             output = {}

diff --git a/ogc2ckan/harvesters/csw.py b/ogc2ckan/harvesters/csw.py
@@ -231,7 +231,7 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str):
         dcat_type = OGC2CKAN_HARVESTER_MD_CONFIG['dcat_type']
         dataset.set_resource_type(
             dcat_type['series'] if layer_info.hierarchy == "series"
-            else dcat_type['service'] if layer_info.hierarchy == "service"
+            else dcat_type['spatial_data_service'] if layer_info.hierarchy == "service"
             else dcat_type['dataset'])
 
         # Set SpatialRepresentationType

diff --git a/ogc2ckan/harvesters/table.py b/ogc2ckan/harvesters/table.py
@@ -75,6 +75,9 @@ def get_file_by_extension(self, harvester_formats):
 
                 logging.info(f"{log_module}:Load '{self.file_extension.upper()}' file: '{filename}' with {len(table_data)} records") 
 
+                # Clean column names by removing leading/trailing whitespaces, newlines, and tabs
+                table_data.columns = table_data.columns.str.strip().str.replace('\n', '').str.replace('\t', '')
+
                 # Remove all fields that are a nan float and trim all spaces of the values
                 table_data = table_data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
                 table_data = table_data.fillna(value='')
@@ -90,18 +93,32 @@ def get_file_by_extension(self, harvester_formats):
                 table_distributions = table_distributions.rename(columns=lambda x: x.replace('resource_', ''))
                 table_datadictionaries = table_datadictionaries.rename(columns=lambda x: re.sub(re.compile(r'datadictionary(_info)?_'), '', x).replace('info.', ''))
 
-                # Group distributions by dataset_id and convert to list of dicts
-                table_distributions_grouped = table_distributions.groupby('dataset_id').apply(lambda x: x.to_dict('records')).to_dict()
-
-                # Group datadictionaries by resource_id and convert to list of dicts
-                table_datadictionaries_grouped = table_datadictionaries.groupby('resource_id').apply(lambda x: x.to_dict('records')).to_dict()
-
+                # Remove rows where 'dataset_id' is None or an empty string
+                table_distributions = table_distributions[table_distributions['dataset_id'].notna() & (table_distributions['dataset_id'] != '')]
+
+                if not table_distributions.empty:
+                    # Group distributions by dataset_id and convert to list of dicts
+                    table_distributions_grouped = table_distributions.groupby('dataset_id' ).apply(lambda x: x.to_dict('records')).to_dict()
+                else:
+                    logging.info(f"{log_module}:No distributions loaded. Check 'distribution.dataset_id' fields")
+                    table_distributions_grouped = None
+
+                # Filter datadictionaries where resource_id is not empty or None
+                if 'resource_id' in table_datadictionaries.columns:
+                    table_datadictionaries = table_datadictionaries[table_datadictionaries['resource_id'].notna() & (table_datadictionaries['resource_id'] != '')]
+
+                    # Group datadictionaries by resource_id and convert to list of dicts
+                    table_datadictionaries_grouped = table_datadictionaries.groupby('resource_id').apply(lambda x: x.to_dict('records')).to_dict()
+                else:
+                    logging.info(f"{log_module}:No datadictionaries loaded. Check 'datadictionary.resource_id' fields.")
+                    table_datadictionaries_grouped = None
+
                 # Add distributions and datadictionaries to each dataset object
                 table_data = [
                     {
                         **d,
                         'distributions': [
-                            {**dr, 'datadictionaries': table_datadictionaries_grouped.get(dr['id'], [])}
+                            {**dr, 'datadictionaries': table_datadictionaries_grouped.get(dr['id'], []) if table_datadictionaries_grouped else []}
                             for dr in table_distributions_grouped.get(
                                 d.get('identifier') or d.get('alternate_identifier') or d.get('inspire_id'), []
                             )
@@ -194,11 +211,9 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
         dataset.set_modified(modified_date)
 
         # DCAT Type (dataset/series)
-        dcat_type = OGC2CKAN_HARVESTER_MD_CONFIG['dcat_type']
-        is_series = False
-        if table_dataset.dcat_type and 'http' in table_dataset.dcat_type and 'series' in table_dataset.dcat_type:
-            is_series = True
-        dataset.set_resource_type(dcat_type['series' if is_series else 'default'])
+        dcat_type = getattr(table_dataset, 'dcat_type', OGC2CKAN_HARVESTER_MD_CONFIG['representation_type']['default'])
+        dcat_type = dcat_type.replace('https:', 'http:') if dcat_type else None
+        dataset.set_resource_type(dcat_type)
 
         # Set SpatialRepresentationType
         representation_type = getattr(table_dataset, 'representation_type', OGC2CKAN_HARVESTER_MD_CONFIG['representation_type']['default']).replace('https:', 'http:')
@@ -298,6 +313,14 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
         metadata_profile = getattr(table_dataset, 'metadata_profile', OGC2CKAN_HARVESTER_MD_CONFIG['metadata_profile'])
         dataset.set_metadata_profile(metadata_profile)
 
+        # Set graphic overview
+        graphic_overview = getattr(table_dataset, 'graphic_overview', None)
+        dataset.set_graphic_overview(graphic_overview)
+
+        # Set purpose
+        purpose = getattr(table_dataset, 'purpose', None)
+        dataset.set_purpose(purpose)
+
         # Set Responsible Parties (Point of contact, Resource publisher and Resource contact/maintainer)
         self.set_default_responsible_parties(dataset, self.default_dcat_info, ckan_info, table_dataset)
 
@@ -315,7 +338,8 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
         dataset.set_license(ckan_info.default_license)
 
         # Set distributions
-        self.get_distribution(ckan_info, dataset, distribution, datadictionary, datadictionaryfield, record, table_dataset)
+        if table_dataset.distributions:
+            self.get_distribution(ckan_info, dataset, distribution, datadictionary, datadictionaryfield, record, table_dataset)
 
         # Metadata distributions (INSPIRE & GeoDCAT-AP)
         self.set_metadata_distributions(ckan_info, dataset, distribution, record)
@@ -324,8 +348,10 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, table_dataset: object =
         keywords = []
         keywords_uri = []
         if hasattr(table_dataset, 'tag_string'):
-            for k in table_dataset.tag_string:
-                keyword_name = k.lower()                
+            tag_string = table_dataset.tag_string
+            tag_string = [tag_string] if isinstance(tag_string, str) else tag_string
+            for k in tag_string:
+                keyword_name = k.lower()
                 if 'http' in keyword_name or '/' in keyword_name:
                     keyword_name = keyword_name.split('/')[-1]
                     keywords_uri.add(keyword_name)
@@ -353,7 +379,7 @@ def get_distribution(self, ckan_info: CKANInfo, dataset, distribution, datadicti
         datadictionaries = []
 
         for i, r in enumerate(table_dataset.distributions):
-            distribution_id = r.get('id', str(uuid.uuid4()))
+            distribution_id = self._normalize_id(r.get('id', str(uuid.uuid4())))
             # Get data dictionaries
             if r.datadictionaries:
                 self.get_datadictionary(datadictionary, datadictionaryfield, r.datadictionaries, distribution_id)
@@ -435,7 +461,7 @@ def _update_custom_formats(format, url=None, **args):
         Returns:
             str: The updated custom format.
         """
-        if any(string in format.lower() for string in ['esri', 'arcgis']) or 'viewer.html?url=' in url:
+        if isinstance(format, str) and (any(string in format.lower() for string in ['esri', 'arcgis']) or 'viewer.html?url=' in url):
             format = 'HTML'
 
         return format

diff --git a/ogc2ckan/harvesters/xml.py b/ogc2ckan/harvesters/xml.py
@@ -72,6 +72,8 @@ def get_metadata_records(self):
             try:
                 metadata = MD_Metadata(etree.parse(md_record))
                 identifier = metadata.identifier
+                #TODO: Multilang also for CSW and OGC harvesters
+                #metadata.locales = ['es', 'en']
                 if identifier:
                     md_records[identifier] = metadata
             except XmlError as e:
@@ -144,7 +146,7 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str):
         dcat_type = OGC2CKAN_HARVESTER_MD_CONFIG['dcat_type']
         dataset.set_resource_type(
             dcat_type['series'] if layer_info.hierarchy == "series"
-            else dcat_type['service'] if layer_info.hierarchy == "service"
+            else dcat_type['spatial_data_service'] if layer_info.hierarchy == "service"
             else dcat_type['dataset'])
 
         # Set SpatialRepresentationType

diff --git a/ogc2ckan/mappings/default_ogc2ckan_config.py b/ogc2ckan/mappings/default_ogc2ckan_config.py
@@ -98,8 +98,19 @@
     'dcat_type': {
         'series': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/series',
         'dataset': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset',
-        'service': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/service',
+        'spatial_data_service': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/service',
         'default': 'http://inspire.ec.europa.eu/metadata-codelist/ResourceType/dataset',
+        'collection': 'http://purl.org/dc/dcmitype/Collection',
+        'event': 'http://purl.org/dc/dcmitype/Event',
+        'image': 'http://purl.org/dc/dcmitype/Image',
+        'still_image': 'http://purl.org/dc/dcmitype/StillImage',
+        'moving_image': 'http://purl.org/dc/dcmitype/MovingImage',
+        'physical_object': 'http://purl.org/dc/dcmitype/PhysicalObject',
+        'interactive_resource': 'http://purl.org/dc/dcmitype/InteractiveResource',
+        'service': 'http://purl.org/dc/dcmitype/Service',
+        'sound': 'http://purl.org/dc/dcmitype/Sound',
+        'software': 'http://purl.org/dc/dcmitype/Software',
+        'text': 'http://purl.org/dc/dcmitype/Text',
     },
     'encoding': 'UTF-8',
     'frequency' : 'http://publications.europa.eu/resource/authority/frequency/UNKNOWN',