Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update latest #21

Merged
merged 10 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
CKAN_URL=http://localhost:5000/
PYCSW_URL=http://localhost:8000/

# PORTS
CKAN_OGC_DEV_PORT=5678

# PATH
APP_DIR=/app
TZ=UTC
Expand Down
3 changes: 2 additions & 1 deletion ckan-ogc/Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ENV DEFAULT_LICENSE=http://creativecommons.org/licenses/by/4.0/
ENV DEFAULT_LICENSE_ID=cc-by
ENV DEV_MODE=True
ENV TIMEOUT=300
ENV CKAN_OGC_DEV_PORT=5678
ENV SSL_UNVERIFIED_MODE=False

RUN apt-get -q -y update && \
Expand All @@ -31,5 +32,5 @@ COPY ckan-ogc/docker-entrypoint.d/entrypoint_dev.sh entrypoint.sh
EXPOSE 5678/TCP

# Set entrypoint with debugpy
ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "./entrypoint_dev.sh"]
ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:${PYCSW_DEV_PORT}", "--wait-for-client", "./entrypoint_dev.sh"]
CMD ["tail", "-f", "/dev/null"]
3 changes: 2 additions & 1 deletion ckan-ogc/Dockerfile.ghcr.dev
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ENV DEFAULT_LICENSE=http://creativecommons.org/licenses/by/4.0/
ENV DEFAULT_LICENSE_ID=cc-by
ENV DEV_MODE=False
ENV TIMEOUT=300
ENV CKAN_OGC_DEV_PORT=5678
ENV SSL_UNVERIFIED_MODE=False

WORKDIR ${APP_DIR}
Expand All @@ -21,5 +22,5 @@ COPY ckan-ogc/docker-entrypoint.d/entrypoint_dev.sh entrypoint.sh
EXPOSE 5678/TCP

# Set entrypoint with debugpy
ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "./entrypoint_dev.sh"]
ENTRYPOINT ["python3", "-m", "debugpy", "--listen", "0.0.0.0:${PYCSW_DEV_PORT}", "--wait-for-client", "./entrypoint_dev.sh"]
CMD ["tail", "-f", "/dev/null"]
14 changes: 9 additions & 5 deletions ckan-ogc/conf/config.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ harvest_servers:
name: 'Example OGC'
groups: []
active: False
ckan_name_not_uuid: True
type: 'ogc'
organization: 'test'
# If the org has custom mappings: ckan-ogc/ogc2ckan/mappings/organizations/*
Expand All @@ -18,11 +19,12 @@ harvest_servers:
publisher_url: 'https://www.example.eu'
publisher_type: 'http://purl.org/adms/publishertype/NationalAuthority'
# Default URIs of metadata contact point and resource maintainer
maintainer_uri: 'https://www.example.eu/org/E05068001'
contact_name: 'Example Organization'
contact_email: 'info@example.eu'
contact_uri: 'https://www.example.eu/org/E05068001'
contact_url: 'https://www.example.eu'
maintainer_uri: 'https://example.eu/'
contact_uri: 'https://example.eu/'
contact_name: 'Example'
contact_email: 'info@example.es'
contact_url: 'https://example.eu/'
topic: 'http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/farming'
lineage_process_steps: 'Spatial dataset generated from the original cartography provided by the competent national agency.'
topic: 'http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/environment'
theme_es: 'http://datos.gob.es/kos/sector-publico/sector/transporte'
Expand Down Expand Up @@ -51,6 +53,7 @@ harvest_servers:
groups: []
active: False
type: 'csw'
ckan_name_not_uuid: False
organization: 'test'
custom_organization_active: False
custom_organization_mapping_file: ''
Expand Down Expand Up @@ -91,6 +94,7 @@ harvest_servers:
name: 'XML Folder'
groups: []
active: False
ckan_name_not_uuid: False
type: 'xml'
organization: 'test'
custom_organization_active: False
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ services:
max-file: "10"
ports:
# debugpy
- "5678:5678"
- "${CKAN_OGC_DEV_PORT}:${CKAN_OGC_DEV_PORT}"
volumes:
- ./log:${APP_DIR}/log
- ./metadata:${APP_DIR}/metadata
Expand Down
53 changes: 39 additions & 14 deletions ogc2ckan/config/ckan_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import logging
import requests
import os
import ssl

# third-party libraries
import psycopg2
from bs4 import BeautifulSoup
import urllib.request

# custom functions
from config.ogc2ckan_config import get_log_module, load_yaml
Expand All @@ -24,9 +26,9 @@ def __init__(self):
self.default_license = os.environ.get('DEFAULT_LICENSE', OGC2CKAN_CKANINFO_CONFIG['default_license'])
self.default_license_id = os.environ.get('DEFAULT_LICENSE_ID', OGC2CKAN_CKANINFO_CONFIG['default_license_id'])
self.ckan_harvester = OGC2CKAN_HARVESTER_CONFIG
self.ssl_unverified_mode = os.environ.get('SSL_UNVERIFIED_MODE', OGC2CKAN_CKANINFO_CONFIG['ssl_unverified_mode'])
self.metadata_distributions = os.environ.get('METADATA_DISTRIBUTIONS', OGC2CKAN_CKANINFO_CONFIG['metadata_distributions'])
self.parallelization = os.environ.get('PARALLELIZATION', OGC2CKAN_CKANINFO_CONFIG['parallelization'])
self.ssl_unverified_mode = True if os.environ.get('SSL_UNVERIFIED_MODE') == 'True' else OGC2CKAN_CKANINFO_CONFIG['ssl_unverified_mode']
self.metadata_distributions = True if os.environ.get('METADATA_DISTRIBUTIONS') == 'True' else OGC2CKAN_CKANINFO_CONFIG['metadata_distributions']
self.parallelization = True if os.environ.get('PARALLELIZATION') == 'True' else OGC2CKAN_CKANINFO_CONFIG['parallelization']
self.dir3_soup = self.get_dir3_soup()
self.ckan_dataset_schema = os.environ.get('CKAN_DATASET_SCHEMA', OGC2CKAN_CKANINFO_CONFIG['ckan_dataset_schema'])

Expand All @@ -39,20 +41,43 @@ def get_dir3_soup(self):
"""
try:
dir3_url = OGC2CKAN_CKANINFO_CONFIG['dir3_url']
response = requests.get(dir3_url)
response.raise_for_status() # Check HTTP status code
self.dir3_soup = BeautifulSoup(response.text, 'html.parser')
except requests.exceptions.HTTPError as errh:
print("HTTP Error:", errh)
request = urllib.request.Request(dir3_url)
response = urllib.request.urlopen(request)

#response = requests.get(dir3_url)
#response.raise_for_status() # Check HTTP status code
assert response.code == 200
self.dir3_soup = BeautifulSoup(response.read(), 'html.parser')

except ssl.CertificateError:
if self.ssl_unverified_mode == True or self.ssl_unverified_mode.lower() == 'true':
hostname = urllib.parse.urlparse(dir3_url).hostname
port = 443 # Assuming HTTPS (default port)
pem_cert = ssl.get_server_certificate((hostname, port))
ssl_context = ssl.create_default_context(cadata=pem_cert)
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# Make the HTTPS request using the custom SSL context.
response = urllib.request.urlopen(request, context=ssl_context)

assert response.code == 200
self.dir3_soup = BeautifulSoup(response.read(), 'html.parser')

else:
raise ssl.CertificateError(f"{log_module}:[INSECURE] Put SSL_UNVERIFIED_MODE=True if the host certificate is self-signed or invalid.")

except requests.exceptions.HTTPError as e:
logging.error(f"{log_module}:HTTP Error getting 'dir3_soup' ({dir3_url}): {e}")
self.dir3_soup = None
except requests.exceptions.ConnectionError as errc:
print("Error Connecting:", errc)
except requests.exceptions.ConnectionError as e:
logging.error(f"{log_module}:Error Connecting: 'dir3_soup' ({dir3_url}): {e}")
self.dir3_soup = None
except requests.exceptions.Timeout as errt:
print("Timeout Error:", errt)
except requests.exceptions.Timeout as e:
logging.error(f"{log_module}:Timeout error: 'dir3_soup' ({dir3_url}): {e}")
self.dir3_soup = None
except requests.exceptions.RequestException as err:
print("Something went wrong:", err)
except requests.exceptions.RequestException as e:
logging.error(f"{log_module}:Something went wrong: 'dir3_soup' ({dir3_url}): {e}")
self.dir3_soup = None

return self.dir3_soup
Expand Down
35 changes: 19 additions & 16 deletions ogc2ckan/controller/ckan_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# third-party libraries
import urllib.request
from pprint import pprint
from pprint import pprint, pformat

# custom functions
from config.ogc2ckan_config import get_log_module
Expand Down Expand Up @@ -101,14 +101,14 @@ def create_ckan_datasets(ckan_site_url: str, authorization_key: str, datasets: o
for dataset in datasets:
try:
if workspaces is not None and not any(x.lower() in dataset.ogc_workspace.lower() for x in workspaces):
continue
break
data = dataset.generate_data()
if data is None:
if data is not None:
create_ckan_dataset(ckan_site_url, ssl_unverified_mode, data, authorization_key)
ckan_dataset_count += 1

except Exception as e:
print(f"\nckan_site_url: {ckan_site_url}\nERROR: {e}\nWhile trying to create: {dataset.name} | {dataset.title}\n{pprint.pformat(dataset.dataset_dict())}\n", file=sys.stderr)
print(f"\nckan_site_url: {ckan_site_url}\nERROR: {e}\nWhile trying to create: {dataset.name} | {dataset.title}\n{pformat(dataset.dataset_dict())}\n", file=sys.stderr)
error_dict = {'title': dataset.title, 'error': str(e)}
if hasattr(dataset, 'inspire_id') and dataset.inspire_id:
error_dict['inspire_id'] = dataset.inspire_id
Expand Down Expand Up @@ -310,18 +310,21 @@ def get_ckan_datasets_list(ckan_site_url: str, ssl_unverified_mode: bool, author
# We'll use the package_search function to list all datasets with fields as need.
url = ckan_site_url + OGC2CKAN_CKAN_API_ROUTES['get_ckan_datasets_list'].format(fields=fields, rows=rows, include_private=include_private)
response = make_request(url=url, ssl_unverified_mode=ssl_unverified_mode, authorization_key=authorization_key, return_result=True)
results = response['result']['results']
count = response['result']['count']
# if response['result']['count'] > rows then we need to paginate the results.
if count > rows:
# Calculate the number of pages we need to paginate through.
pages = count // rows + 1
# Paginate through the results.
for page in range(2, pages + 1):
url = ckan_site_url + OGC2CKAN_CKAN_API_ROUTES['get_ckan_datasets_list_paginate'].format(fields=fields, rows=rows, include_private=include_private, start=rows * (page - 1))
response = make_request(url=url, ssl_unverified_mode=ssl_unverified_mode, authorization_key=authorization_key, return_result=True)
results += response['result']['results']

if response is not None:
results = response['result']['results']
count = response['result']['count']
# if response['result']['count'] > rows then we need to paginate the results.
if count > rows:
# Calculate the number of pages we need to paginate through.
pages = count // rows + 1
# Paginate through the results.
for page in range(2, pages + 1):
url = ckan_site_url + OGC2CKAN_CKAN_API_ROUTES['get_ckan_datasets_list_paginate'].format(fields=fields, rows=rows, include_private=include_private, start=rows * (page - 1))
response = make_request(url=url, ssl_unverified_mode=ssl_unverified_mode, authorization_key=authorization_key, return_result=True)
results += response['result']['results']
else:
results = []

return results

def get_ckan_dataset_info(ckan_site_url: str, ssl_unverified_mode: bool, authorization_key: Optional[str] = None, field: str = 'id', field_value: Optional[str] = None) -> None:
Expand Down
14 changes: 11 additions & 3 deletions ogc2ckan/harvesters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class Harvester:
get_all_datasets(self, ckan_info): Gets all datasets from the server.

'''
def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, default_dcat_info):
def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, default_dcat_info):
self.app_dir = app_dir
self.url = url
self.name = name
Expand All @@ -97,6 +97,7 @@ def __init__(self, app_dir, url, name, groups, active, organization, type, custo
self.default_dcat_info = DCATInfo(default_dcat_info) if default_dcat_info else None
self.default_keywords = default_keywords
self.default_inspire_info = default_inspire_info
self.ckan_name_not_uuid = ckan_name_not_uuid or False
self.datasets = []
self.datadictionaries = []
self.ckan_dataset_count = 0
Expand Down Expand Up @@ -182,7 +183,7 @@ def get_dataset_common_elements(self, record: str, ckan_dataset_schema: str) ->
- dataset (object): The CKAN dataset class based on the schema.
- distribution (object): The CKAN distribution class based on the schema.
- uuid_identifier (str): A UUID identifier for the dataset.
- ckan_name (str): The CKAN name for the dataset, based on the UUID identifier and organization.
- ckan_name (str): The CKAN name for the dataset, based on the UUID or the identifier and organization.
- ckan_groups (list): A list of CKAN groups for the dataset.
- inspire_id (str): The INSPIRE ID for the dataset.
"""
Expand All @@ -199,7 +200,13 @@ def get_dataset_common_elements(self, record: str, ckan_dataset_schema: str) ->
datadictionaryfield = schema ["datadictionaryfield"]

uuid_identifier = self._create_uuid_identifier()
ckan_name = uuid_identifier

# Use ckan_name instead of uuid_identifier if required
if self.ckan_name_not_uuid:
ckan_name = self._get_ckan_name(record, self.organization)
uuid_identifier = ckan_name
else:
ckan_name = uuid_identifier

ckan_groups = [{'name': g.lower()} for g in self.groups or []]

Expand Down Expand Up @@ -501,6 +508,7 @@ def _create_harvester_from_server(harvest_server, harvester_class):
private_datasets=harvest_server.private_datasets,
default_keywords=harvest_server.default_keywords,
default_inspire_info=harvest_server.default_inspire_info,
ckan_name_not_uuid=harvest_server.ckan_name_not_uuid,
**harvest_server.default_dcat_info
)

Expand Down
6 changes: 3 additions & 3 deletions ogc2ckan/harvesters/csw.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(self, **entries):
setattr(self, key, value)

class HarvesterCSW(Harvester):
def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, constraints, **default_dcat_info):
def __init__(self, app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, ckan_name_not_uuid, constraints, **default_dcat_info):
super().__init__(app_dir, url, name, groups, active, organization, type, custom_organization_active, custom_organization_mapping_file, private_datasets, default_keywords, default_inspire_info, **default_dcat_info)
self.constraints = constraints
self.csw = None
Expand Down Expand Up @@ -278,10 +278,10 @@ def get_dataset(self, ckan_info: CKANInfo, record: str, service_type: str):
dataset.set_spatial_uri(self.get_custom_metadata_value(custom_metadata, 'spatial_uri'))

# Set temporal coverage
if layer_info.identification.temporalextent_end and layer_info.identification.temporalextent_start:
try:
dataset.set_temporal_start(layer_info.identification.temporalextent_start)
dataset.set_temporal_end(layer_info.identification.temporalextent_end)
else:
except AttributeError:
dataset.set_temporal_start(self.get_custom_metadata_value(custom_metadata, 'temporal_start'))
dataset.set_temporal_end(self.get_custom_metadata_value(custom_metadata, 'temporal_end'))

Expand Down
Loading