diff --git a/.env.example b/.env.example index c94c8f2..e3076df 100644 --- a/.env.example +++ b/.env.example @@ -2,6 +2,9 @@ CKAN_URL=http://localhost:5000/ PYCSW_URL=http://localhost:8000/ +# PORTS +CKAN_OGC_DEV_PORT=5678 + # PATH APP_DIR=/app TZ=UTC diff --git a/ckan-ogc/Dockerfile.dev b/ckan-ogc/Dockerfile.dev index 44fb046..e1d4b08 100644 --- a/ckan-ogc/Dockerfile.dev +++ b/ckan-ogc/Dockerfile.dev @@ -11,6 +11,7 @@ ENV DEFAULT_LICENSE=http://creativecommons.org/licenses/by/4.0/ ENV DEFAULT_LICENSE_ID=cc-by ENV DEV_MODE=True ENV TIMEOUT=300 +ENV CKAN_OGC_DEV_PORT=5678 ENV SSL_UNVERIFIED_MODE=False RUN apt-get -q -y update && \ @@ -31,5 +32,5 @@ COPY ckan-ogc/docker-entrypoint.d/entrypoint_dev.sh entrypoint.sh EXPOSE 5678/TCP # Set entrypoint with debugpy -ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "./entrypoint_dev.sh"] +ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:${PYCSW_DEV_PORT}", "--wait-for-client", "./entrypoint_dev.sh"] CMD ["tail", "-f", "/dev/null"] \ No newline at end of file diff --git a/ckan-ogc/Dockerfile.ghcr.dev b/ckan-ogc/Dockerfile.ghcr.dev index d5ba282..30778a5 100644 --- a/ckan-ogc/Dockerfile.ghcr.dev +++ b/ckan-ogc/Dockerfile.ghcr.dev @@ -11,6 +11,7 @@ ENV DEFAULT_LICENSE=http://creativecommons.org/licenses/by/4.0/ ENV DEFAULT_LICENSE_ID=cc-by ENV DEV_MODE=False ENV TIMEOUT=300 +ENV CKAN_OGC_DEV_PORT=5678 ENV SSL_UNVERIFIED_MODE=False WORKDIR ${APP_DIR} @@ -21,5 +22,5 @@ COPY ckan-ogc/docker-entrypoint.d/entrypoint_dev.sh entrypoint.sh EXPOSE 5678/TCP # Set entrypoint with debugpy -ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "./entrypoint_dev.sh"] +ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:${PYCSW_DEV_PORT}", "--wait-for-client", "./entrypoint_dev.sh"] CMD ["tail", "-f", "/dev/null"] \ No newline at end of file diff --git a/ckan-ogc/conf/config.yaml.template b/ckan-ogc/conf/config.yaml.template index 0c39c35..691a904 100644 --- a/ckan-ogc/conf/config.yaml.template +++ b/ckan-ogc/conf/config.yaml.template @@ -5,6 +5,7 @@ harvest_servers: name: 'Example OGC' groups: [] active: False + ckan_name_not_uuid: True type: 'ogc' organization: 'test' # If the org has a custom mappings: ckan-ogc/ogc2ckan/mappings/organizations/* @@ -18,11 +19,12 @@ harvest_servers: publisher_url: 'https://www.example.eu' publisher_type: 'http://purl.org/adms/publishertype/NationalAuthority' # Default URIs of metadata contact point and resource maintainer - maintainer_uri: 'https://www.example.eu/org/E05068001' - contact_name: 'Example Organization' - contact_email: 'info@example.eu' - contact_uri: 'https://www.example.eu/org/E05068001' - contact_url: 'https://www.example.eu' + maintainer_uri: 'https://example.eu/' + contact_uri: 'https://example.eu/' + contact_name: 'Example' + contact_email: 'info@example.es' + contact_url: 'https://example.eu/' + topic: 'http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/farming' lineage_process_steps: 'Spatial dataset generated from the original cartography provided by the competent national agency.' topic: 'http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/environment' theme_es: 'http://datos.gob.es/kos/sector-publico/sector/transporte' @@ -51,6 +53,7 @@ harvest_servers: groups: [] active: False type: 'csw' + ckan_name_not_uuid: False organization: 'test' custom_organization_active: False custom_organization_mapping_file: '' @@ -91,6 +94,7 @@ harvest_servers: name: 'XML Folder' groups: [] active: False + ckan_name_not_uuid: False type: 'xml' organization: 'test' custom_organization_active: False diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 090a064..0b90e0d 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -15,7 +15,7 @@ services: max-file: "10" ports: # debugpy - - "5678:5678" + - "${CKAN_OGC_DEV_PORT}:${CKAN_OGC_DEV_PORT}" volumes: - ./log:${APP_DIR}/log - ./metadata:${APP_DIR}/metadata diff --git a/ogc2ckan/config/ckan_config.py b/ogc2ckan/config/ckan_config.py index 549b200..1937a8a 100644 --- a/ogc2ckan/config/ckan_config.py +++ b/ogc2ckan/config/ckan_config.py @@ -4,10 +4,12 @@ import logging import requests import os +import ssl # third-party libraries import psycopg2 from bs4 import BeautifulSoup +import urllib.request # custom functions from config.ogc2ckan_config import get_log_module, load_yaml @@ -24,9 +26,9 @@ def __init__(self): self.default_license = os.environ.get('DEFAULT_LICENSE', OGC2CKAN_CKANINFO_CONFIG['default_license']) self.default_license_id = os.environ.get('DEFAULT_LICENSE_ID', OGC2CKAN_CKANINFO_CONFIG['default_license_id']) self.ckan_harvester = OGC2CKAN_HARVESTER_CONFIG - self.ssl_unverified_mode = os.environ.get('SSL_UNVERIFIED_MODE', OGC2CKAN_CKANINFO_CONFIG['ssl_unverified_mode']) - self.metadata_distributions = os.environ.get('METADATA_DISTRIBUTIONS', OGC2CKAN_CKANINFO_CONFIG['metadata_distributions']) - self.parallelization = os.environ.get('PARALLELIZATION', OGC2CKAN_CKANINFO_CONFIG['parallelization']) + self.ssl_unverified_mode = True if os.environ.get('SSL_UNVERIFIED_MODE') == 'True' else OGC2CKAN_CKANINFO_CONFIG['ssl_unverified_mode'] + self.metadata_distributions = True if os.environ.get('METADATA_DISTRIBUTIONS') == 'True' else OGC2CKAN_CKANINFO_CONFIG['metadata_distributions'] + self.parallelization = True if os.environ.get('PARALLELIZATION') == 'True' else OGC2CKAN_CKANINFO_CONFIG['parallelization'] self.dir3_soup = self.get_dir3_soup() self.ckan_dataset_schema = os.environ.get('CKAN_DATASET_SCHEMA', OGC2CKAN_CKANINFO_CONFIG['ckan_dataset_schema']) @@ -39,20 +41,43 @@ def get_dir3_soup(self): """ try: dir3_url = OGC2CKAN_CKANINFO_CONFIG['dir3_url'] - response = requests.get(dir3_url) - response.raise_for_status() # Check HTTP status code - self.dir3_soup = BeautifulSoup(response.text, 'html.parser') - except requests.exceptions.HTTPError as errh: - print("HTTP Error:", errh) + request = urllib.request.Request(dir3_url) + response = urllib.request.urlopen(request) + + #response = requests.get(dir3_url) + #response.raise_for_status() # Check HTTP status code + assert response.code == 200 + self.dir3_soup = BeautifulSoup(response.read(), 'html.parser') + + except ssl.CertificateError: + if self.ssl_unverified_mode == True or self.ssl_unverified_mode.lower() == 'true': + hostname = urllib.parse.urlparse(dir3_url).hostname + port = 443 # Assuming HTTPS (default port) + pem_cert = ssl.get_server_certificate((hostname, port)) + ssl_context = ssl.create_default_context(cadata=pem_cert) + ssl_context.check_hostname = False + ssl_context.verify_mode = ssl.CERT_NONE + + # Make the HTTPS request using the custom SSL context. + response = urllib.request.urlopen(request, context=ssl_context) + + assert response.code == 200 + self.dir3_soup = BeautifulSoup(response.read(), 'html.parser') + + else: + raise ssl.CertificateError(f"{log_module}:[INSECURE] Put SSL_UNVERIFIED_MODE=True if the host certificate is self-signed or invalid.") + + except requests.exceptions.HTTPError as e: + logging.error(f"{log_module}:HTTP Error getting 'dir3_soup' ({dir3_url}): {e}") self.dir3_soup = None - except requests.exceptions.ConnectionError as errc: - print("Error Connecting:", errc) + except requests.exceptions.ConnectionError as e: + logging.error(f"{log_module}:Error Connecting: 'dir3_soup' ({dir3_url}): {e}") self.dir3_soup = None - except requests.exceptions.Timeout as errt: - print("Timeout Error:", errt) + except requests.exceptions.Timeout as e: + logging.error(f"{log_module}:Timeout error: 'dir3_soup' ({dir3_url}): {e}") self.dir3_soup = None - except requests.exceptions.RequestException as err: - print("Something went wrong:", err) + except requests.exceptions.RequestException as e: + logging.error(f"{log_module}:Something went wrong: 'dir3_soup' ({dir3_url}): {e}") self.dir3_soup = None return self.dir3_soup diff --git a/ogc2ckan/controller/ckan_management.py b/ogc2ckan/controller/ckan_management.py index 3d3985c..5beb8f1 100644 --- a/ogc2ckan/controller/ckan_management.py +++ b/ogc2ckan/controller/ckan_management.py @@ -9,7 +9,7 @@ # third-party libraries import urllib.request -from pprint import pprint +from pprint import pprint, pformat # custom functions from config.ogc2ckan_config import get_log_module @@ -101,14 +101,14 @@ def create_ckan_datasets(ckan_site_url: str, authorization_key: str, datasets: o for dataset in datasets: try: if workspaces is not None and not any(x.lower() in dataset.ogc_workspace.lower() for x in workspaces): - continue + break data = dataset.generate_data() - if data is None: + if data is not None: create_ckan_dataset(ckan_site_url, ssl_unverified_mode, data, authorization_key) ckan_dataset_count += 1 except Exception as e: - print(f"\nckan_site_url: {ckan_site_url}\nERROR: {e}\nWhile trying to create: {dataset.name} | {dataset.title}\n{pprint.pformat(dataset.dataset_dict())}\n", file=sys.stderr) + print(f"\nckan_site_url: {ckan_site_url}\nERROR: {e}\nWhile trying to create: {dataset.name} | {dataset.title}\n{pformat(dataset.dataset_dict())}\n", file=sys.stderr) error_dict = {'title': dataset.title, 'error': str(e)} if hasattr(dataset, 'inspire_id') and dataset.inspire_id: error_dict['inspire_id'] = dataset.inspire_id @@ -310,18 +310,21 @@ def get_ckan_datasets_list(ckan_site_url: str, ssl_unverified_mode: bool, author # We'll use the package_search function to list all datasets with fields as need. url = ckan_site_url + OGC2CKAN_CKAN_API_ROUTES['get_ckan_datasets_list'].format(fields=fields, rows=rows, include_private=include_private) response = make_request(url=url, ssl_unverified_mode=ssl_unverified_mode, authorization_key=authorization_key, return_result=True) - results = response['result']['results'] - count = response['result']['count'] - # if response['result']['count'] > rows then we need to paginate the results. - if count > rows: - # Calculate the number of pages we need to paginate through. - pages = count // rows + 1 - # Paginate through the results. - for page in range(2, pages + 1): - url = ckan_site_url + OGC2CKAN_CKAN_API_ROUTES['get_ckan_datasets_list_paginate'].format(fields=fields, rows=rows, include_private=include_private, start=rows * (page - 1)) - response = make_request(url=url, ssl_unverified_mode=ssl_unverified_mode, authorization_key=authorization_key, return_result=True) - results += response['result']['results'] - + if response is not None: + results = response['result']['results'] + count = response['result']['count'] + # if response['result']['count'] > rows then we need to paginate the results. + if count > rows: + # Calculate the number of pages we need to paginate through. + pages = count // rows + 1 + # Paginate through the results. + for page in range(2, pages + 1): + url = ckan_site_url + OGC2CKAN_CKAN_API_ROUTES['get_ckan_datasets_list_paginate'].format(fields=fields, rows=rows, include_private=include_private, start=rows * (page - 1)) + response = make_request(url=url, ssl_unverified_mode=ssl_unverified_mode, authorization_key=authorization_key, return_result=True) + results += response['result']['results'] + else: + results = [] + return results def get_ckan_dataset_info(ckan_site_url: str, ssl_unverified_mode: bool, authorization_key: Optional[str] = None, field: str = 'id', field_value: Optional[str] = None) -> None: diff --git a/ogc2ckan/harvesters/base.py b/ogc2ckan/harvesters/base.py index 1bc11a2..a14fc82 100644 --- a/ogc2ckan/harvesters/base.py +++ b/ogc2ckan/harvesters/base.py @@ -83,7 +83,7 @@ class Harvester: get_all_datasets(self, ckan_info): Gets all datasets from the server. ''' - def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, default_dcat_info): + def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, default_dcat_info): self.app_dir = app_dir self.url = url self.name = name @@ -97,6 +97,7 @@ def __init__(self, app_dir, url, name, groups, active, organization, type, custo self.default_dcat_info = DCATInfo(default_dcat_info) if default_dcat_info else None self.default_keywords = default_keywords self.default_inspire_info = default_inspire_info + self.ckan_name_not_uuid = ckan_name_not_uuid or False self.datasets = [] self.datadictionaries = [] self.ckan_dataset_count = 0 @@ -182,7 +183,7 @@ def get_dataset_common_elements(self, record: str, ckan_dataset_schema: str) -> - dataset (object): The CKAN dataset class based on the schema. - distribution (object): The CKAN distribution class based on the schema. - uuid_identifier (str): A UUID identifier for the dataset. - - ckan_name (str): The CKAN name for the dataset, based on the UUID identifier and organization. + - ckan_name (str): The CKAN name for the dataset, based on the UUID or the identifier and organization. - ckan_groups (list): A list of CKAN groups for the dataset. - inspire_id (str): The INSPIRE ID for the dataset. """ @@ -199,7 +200,13 @@ def get_dataset_common_elements(self, record: str, ckan_dataset_schema: str) -> datadictionaryfield = schema ["datadictionaryfield"] uuid_identifier = self._create_uuid_identifier() - ckan_name = uuid_identifier + + # Use ckan_name instead of uuid_identifier if required + if self.ckan_name_not_uuid: + ckan_name = self._get_ckan_name(record, self.organization) + uuid_identifier = ckan_name + else: + ckan_name = uuid_identifier ckan_groups = [{'name': g.lower()} for g in self.groups or []] @@ -501,6 +508,7 @@ def _create_harvester_from_server(harvest_server, harvester_class): private_datasets=harvest_server.private_datasets, default_keywords=harvest_server.default_keywords, default_inspire_info=harvest_server.default_inspire_info, + ckan_name_not_uuid=harvest_server.ckan_name_not_uuid, **harvest_server.default_dcat_info ) diff --git a/ogc2ckan/harvesters/csw.py b/ogc2ckan/harvesters/csw.py index 3b4f71f..ee184f6 100644 --- a/ogc2ckan/harvesters/csw.py +++ b/ogc2ckan/harvesters/csw.py @@ -35,7 +35,7 @@ def __init__(self, **entries): setattr(self, key, value) class HarvesterCSW(Harvester): - def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, constraints, **default_dcat_info): + def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, constraints, **default_dcat_info): super().__init__(app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, **default_dcat_info) self.constraints = constraints self.csw = None @@ -278,10 +278,10 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str): dataset.set_spatial_uri(self.get_custom_metadata_value(custom_metadata, 'spatial_uri')) # Set temporal coverage - if layer_info.identification.temporalextent_end and layer_info.identification.temporalextent_start: + try: dataset.set_temporal_start(layer_info.identification.temporalextent_start) dataset.set_temporal_end(layer_info.identification.temporalextent_end) - else: + except AttributeError: dataset.set_temporal_start(self.get_custom_metadata_value(custom_metadata, 'temporal_start')) dataset.set_temporal_end(self.get_custom_metadata_value(custom_metadata, 'temporal_end')) diff --git a/ogc2ckan/harvesters/ogc.py b/ogc2ckan/harvesters/ogc.py index 9b22721..79fd1e6 100644 --- a/ogc2ckan/harvesters/ogc.py +++ b/ogc2ckan/harvesters/ogc.py @@ -19,8 +19,8 @@ class HarvesterOGC(Harvester): - def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, workspaces, constraints=None, **default_dcat_info): - super().__init__(app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, **default_dcat_info) + def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, workspaces, constraints=None, **default_dcat_info): + super().__init__(app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, **default_dcat_info) self.workspaces = workspaces self.constraints = constraints self.wms = None @@ -113,27 +113,25 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str): dataset, distribution, datadictionary, datadictionaryfield, uuid_identifier, ckan_name, ckan_groups, inspire_id = \ self.get_dataset_common_elements(record, ckan_info.ckan_dataset_schema) - # CONNECT + # layer_info for wfs/wcs if service_type == 'wfs': - wfs = self.connect_wfs() - layer_info = self.wfs.contents[record] + layer_info = self.wfs.contents.get(record) + wms_name = record + wms_layer_info = self.wms.contents.get(wms_name) + wmts_layer_info = self.wmts.contents.get(wms_name) elif service_type == 'wcs': - wcs = self.connect_wcs() - layer_info = self.wcs.contents[record] - - wms = self.connect_wms() - wmts = self.connect_wmts() + layer_info = self.wcs.contents.get(record) + wms_name = record.replace("__", ":") + wms_layer_info = self.wms.contents.get(wms_name) + wmts_layer_info = self.wmts.contents.get(wms_name) # Search if custom organization info exists for the dataset custom_metadata = None if self.custom_organization_active: - custom_metadata = self.get_custom_default_metadata(layer_info.id.split(':')[1]) + custom_metadata = self.get_custom_default_metadata(wms_name.split(':')[1]) # OGC Workspace and OGC services info - ogc_workspace = layer_info.id.split(':')[0] if layer_info.id else None - wms_layer_info = wms.contents.get(record) - wmts_layer_info = wmts.contents.get(record) - json_info = wfs.contents.get(record) + ogc_workspace = wms_name.split(':')[0] if wms_name else None # Set basic info of MD dataset = dataset(uuid_identifier, ckan_name, self.organization, ckan_info.default_license_id) @@ -144,7 +142,7 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str): dataset.set_private(private) # Set alternate identifier (layer name) - alternate_identifier = layer_info.id if layer_info.id else None + alternate_identifier = wms_name if wms_name else None dataset.set_alternate_identifier(alternate_identifier) # Title @@ -262,16 +260,16 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str): self.set_default_responsible_parties(dataset, self.default_dcat_info, ckan_info) # Overwrite Point of contact (Metadata) and Responsible Party (Resource) from OGC Info - if wms.provider: - contact_name = wms.provider.contact.name if wms.provider.contact.name is not None else wms.provider.contact.organization + if self.wms.provider: + contact_name = self.wms.provider.contact.name if self.wms.provider.contact.name is not None else self.wms.provider.contact.organization dataset.set_contact_name(contact_name) - dataset.set_contact_email(wms.provider.contact.email.lower()) + dataset.set_contact_email(self.wms.provider.contact.email.lower()) # Set license dataset.set_license(ckan_info.default_license) # Set distributions - self.get_distribution(ckan_info, dataset, distribution, record, service_type, layer_info, wms_layer_info, wmts_layer_info, json_info) + self.get_distribution(ckan_info, dataset, distribution, record, service_type, layer_info, wms_layer_info, wmts_layer_info) # Metadata distributions (INSPIRE & GeoDCAT-AP) self.set_metadata_distributions(ckan_info, dataset, distribution, record) @@ -281,7 +279,7 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str): return dataset - def get_distribution(self, ckan_info: CKANInfo, dataset, distribution, record: str, service_type: str, layer_info,wms_layer_info, wmts_layer_info, json_info): + def get_distribution(self, ckan_info: CKANInfo, dataset, distribution, record: str, service_type: str, layer_info, wms_layer_info, wmts_layer_info): # Add distributions (WMS, WFS, WMTS & GeoJSON) dataset.set_distributions([]) @@ -306,17 +304,16 @@ def add_distribution(distribution, dist_info): dist_info = self._get_distribution_info("WFS", wfs_url, self.localized_strings_dict['distributions']['wfs'], ckan_info.default_license, ckan_info.default_license_id, dataset.access_rights, dataset.language) add_distribution(distribution, dist_info) - # WCS - if service_type == "wcs": - wcs_url = f"{self.get_wcs_url()}&request=GetCapabilities#{record}" - dist_info = self._get_distribution_info("WCS", wcs_url, self.localized_strings_dict['distributions']['wcs'], ckan_info.default_license, ckan_info.default_license_id, dataset.access_rights, dataset.language) - add_distribution(distribution, dist_info) - - # GeoJSON - if json_info is not None: + # GeoJSON layer_id_parts = layer_info.id.split(':') workspace = layer_id_parts[0] layername = layer_id_parts[1] json_url = f"{self.get_wfs_url().replace('geoserver/ows', f'geoserver/{workspace.lower()}/ows')}&version=1.0.0&request=GetFeature&typeName={layername.lower()}&outputFormat=application/json&maxFeatures=100" - dist_info = self._get_distribution_info("GeoJSON", json_url, self.localized_strings_dict['distributions']['geojson'], ckan_info.default_license, ckan_info.default_license_id, dataset.access_rights, dataset.language) + dist_info_json = self._get_distribution_info("GeoJSON", json_url, self.localized_strings_dict['distributions']['geojson'], ckan_info.default_license, ckan_info.default_license_id, dataset.access_rights, dataset.language) + add_distribution(distribution, dist_info_json) + + # WCS + if service_type == "wcs": + wcs_url = f"{self.get_wcs_url()}&request=GetCapabilities#{record}" + dist_info = self._get_distribution_info("WCS", wcs_url, self.localized_strings_dict['distributions']['wcs'], ckan_info.default_license, ckan_info.default_license_id, dataset.access_rights, dataset.language) add_distribution(distribution, dist_info) diff --git a/ogc2ckan/harvesters/table.py b/ogc2ckan/harvesters/table.py index 24c86aa..79f016e 100644 --- a/ogc2ckan/harvesters/table.py +++ b/ogc2ckan/harvesters/table.py @@ -48,8 +48,8 @@ def get(self, key, default=None): return getattr(self, key, default) class HarvesterTable(Harvester): - def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, **default_dcat_info): - super().__init__(app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, **default_dcat_info) + def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, **default_dcat_info): + super().__init__(app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, **default_dcat_info) self.file_extension = Path(self.url).suffix[1:] self.table_data = [] self.datadictionaries = [] diff --git a/ogc2ckan/harvesters/xml.py b/ogc2ckan/harvesters/xml.py index c42697b..0242045 100644 --- a/ogc2ckan/harvesters/xml.py +++ b/ogc2ckan/harvesters/xml.py @@ -25,8 +25,8 @@ class XmlError(Exception): pass class HarvesterXML(Harvester): - def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, constraints, **default_dcat_info): - super().__init__(app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, **default_dcat_info) + def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, constraints, **default_dcat_info): + super().__init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, **default_dcat_info) self.md_records = None self.folder_path = None self.formats = OGC2CKAN_HARVESTER_CONFIG['xml']['formats'] @@ -191,10 +191,10 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str): dataset.set_spatial_uri(self.get_custom_metadata_value(custom_metadata, 'spatial_uri')) # Set temporal coverage - if layer_info.identification.temporalextent_end and layer_info.identification.temporalextent_start: + try: dataset.set_temporal_start(layer_info.identification.temporalextent_start) dataset.set_temporal_end(layer_info.identification.temporalextent_end) - else: + except AttributeError: dataset.set_temporal_start(self.get_custom_metadata_value(custom_metadata, 'temporal_start')) dataset.set_temporal_end(self.get_custom_metadata_value(custom_metadata, 'temporal_end')) diff --git a/ogc2ckan/model/harvest_schema.py b/ogc2ckan/model/harvest_schema.py index 05c4086..edff3b6 100644 --- a/ogc2ckan/model/harvest_schema.py +++ b/ogc2ckan/model/harvest_schema.py @@ -30,6 +30,7 @@ class HarvesterSchema: "type": "string", "enum": [value['type'] for key, value in OGC2CKAN_HARVESTER_CONFIG.items()] }, + "ckan_name_not_uuid": {"type": "boolean"}, "organization": {"type": "string"}, "custom_organization_active": {"type": "boolean"}, "custom_organization_mapping_file": {"type": "string"}, @@ -76,7 +77,7 @@ class HarvesterSchema: "required": ["inspireid_theme", "inspireid_nutscode", "inspireid_versionid"] }, }, - "required": ["url", "name", "groups", "active", "type", "organization", "custom_organization_active", "custom_organization_mapping_file", "private_datasets", "default_dcat_info","default_keywords", "default_inspire_info"] + "required": ["url", "name", "groups", "active", "type", "ckan_name_not_uuid", "organization", "custom_organization_active", "custom_organization_mapping_file", "private_datasets", "default_dcat_info","default_keywords", "default_inspire_info"] } class HarvesterSchemaCSW(HarvesterSchema): diff --git a/ogc2ckan/ogc2ckan.py b/ogc2ckan/ogc2ckan.py index 2e7f2f6..7dfd1ff 100644 --- a/ogc2ckan/ogc2ckan.py +++ b/ogc2ckan/ogc2ckan.py @@ -23,6 +23,7 @@ TZ = os.environ.get("TZ", "TZ") DEV_MODE = None VERSION = os.environ.get("VERSION", "0.1") +CKAN_OGC_DEV_PORT = os.environ.get("CKAN_OGC_DEV_PORT", 5678) APP_DIR = os.environ.get("APP_DIR", "/app") config_file = os.path.abspath(APP_DIR + "/config.yaml") log_module = "[ogc2ckan]" @@ -116,15 +117,17 @@ def start_harvesting(config_file): logging.info(f"{log_module}:CKAN_URL: {ckan_info.ckan_site_url}") try: - if harvest_servers is not None: - if ckan_info.parallelization is True: - #TODO: Fix multicore parallel processing - parallel_count = Parallel(n_jobs=processes)(delayed(launch_harvest)(harvest_server=endpoint, ckan_info=ckan_info) for endpoint in harvest_servers) - new_records.append(sum(i[0] for i in parallel_count)) - else: - for endpoint in harvest_servers: - harvester = launch_harvest(harvest_server=endpoint, ckan_info=ckan_info) - new_records.append(harvester.ckan_dataset_count) + if harvest_servers is not None and ckan_info.parallelization is True: + #TODO: Fix multicore parallel processing + logging.warning(f'{log_module}:Parallel processing is not implemented yet.') + ''' + parallel_count = Parallel(n_jobs=processes)(delayed(launch_harvest)(harvest_server=endpoint, ckan_info=ckan_info) for endpoint in harvest_servers) + new_records.append(sum(i[0] for i in parallel_count)) + ''' + elif harvest_servers and ckan_info.parallelization is False: + for endpoint in harvest_servers: + harvester = launch_harvest(harvest_server=endpoint, ckan_info=ckan_info) + new_records.append(harvester.ckan_dataset_count) except Exception as e: logging.error(f"{log_module}:Check invalid 'type' and 'active: True' in 'harvest_servers/{{my-harvest-server}}'at {config_file} Error: {e}") new_records = 0 @@ -154,7 +157,7 @@ def main(): if __name__ == "__main__": if DEV_MODE == True or DEV_MODE == "True": # Allow other computers to attach to ptvsd at this IP address and port. - ptvsd.enable_attach(address=("0.0.0.0", 5678), redirect_output=True) + ptvsd.enable_attach(address=("0.0.0.0", CKAN_OGC_DEV_PORT), redirect_output=True) # Pause the program until a remote debugger is attached ptvsd.wait_for_attach()