From 5da0857f116b56e89aba39c7cd5bf071af7bd629 Mon Sep 17 00:00:00 2001 From: Sergey Motornyuk Date: Mon, 6 Jan 2020 18:37:40 +0200 Subject: [PATCH 01/21] Prepare codebase --- ckanext/dcat/cli.py | 25 +++ ckanext/dcat/commands.py | 29 +--- ckanext/dcat/controllers.py | 116 ++------------ ckanext/dcat/exceptions.py | 8 + ckanext/dcat/logic.py | 2 +- .../dcat/{plugins.py => plugins/__init__.py} | 81 ++++------ ckanext/dcat/plugins/flask_plugin.py | 30 ++++ ckanext/dcat/plugins/pylons_plugin.py | 52 ++++++ ckanext/dcat/processors.py | 11 +- ckanext/dcat/templates/home/index.html | 12 +- ckanext/dcat/templates/package/read_base.html | 22 +-- ckanext/dcat/templates/package/search.html | 12 +- ckanext/dcat/utils.py | 150 +++++++++++++++++- ckanext/dcat/views.py | 42 +++++ requirements.txt | 2 +- 15 files changed, 382 insertions(+), 212 deletions(-) create mode 100644 ckanext/dcat/cli.py create mode 100644 ckanext/dcat/exceptions.py rename ckanext/dcat/{plugins.py => plugins/__init__.py} (65%) create mode 100644 ckanext/dcat/plugins/flask_plugin.py create mode 100644 ckanext/dcat/plugins/pylons_plugin.py create mode 100644 ckanext/dcat/views.py diff --git a/ckanext/dcat/cli.py b/ckanext/dcat/cli.py new file mode 100644 index 00000000..ade76959 --- /dev/null +++ b/ckanext/dcat/cli.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +import click +import ckan.plugins.toolkit as tk +import ckanext.dcat.utils as utils + +@click.group() +def generate_static(): + """Generates static files containing all datasets. + + """ + pass + +@generate_static.command() +@click.argument('output', type=click.File(mode="w")) +def json(output): + """The generate command will generate a static file containing all of + the datasets in the catalog in JSON format. + + """ + utils.generate_static_json(output) + + +def get_commands(): + return [generate_static] diff --git a/ckanext/dcat/commands.py b/ckanext/dcat/commands.py index ff68bb78..39f07b93 100644 --- a/ckanext/dcat/commands.py +++ b/ckanext/dcat/commands.py @@ -1,9 +1,10 @@ -import json -import logging +# -*- coding: utf-8 -*- -from pylons import config +import logging from ckan import plugins as p +import ckanext.dcat.utils as utils + class GenerateStaticDCATCommand(p.toolkit.CkanCommand): """ @@ -42,25 +43,5 @@ def generate(self, output): Keep reading and converting datasets until we get an empty list back from dcat_datasets_list """ - data_dict = {'page': 0} - with open(output, 'w') as f: - f.write(u"[") - - while True: - try: - data_dict['page'] = data_dict['page'] + 1 - datasets = \ - p.toolkit.get_action('dcat_datasets_list')({}, - data_dict) - except p.toolkit.ValidationError, e: - self.log.exception(e) - break - - if not datasets: - break - - for dataset in datasets: - f.write(json.dumps(dataset)) - - f.write(u"]") + utils.generate_static_json(f) diff --git a/ckanext/dcat/controllers.py b/ckanext/dcat/controllers.py index 4aeb5831..f8dc8daa 100644 --- a/ckanext/dcat/controllers.py +++ b/ckanext/dcat/controllers.py @@ -1,124 +1,28 @@ import json - -from ckan import model from ckan.plugins import toolkit +import ckanext.dcat.utils as utils + + if toolkit.check_ckan_version(min_version='2.1'): BaseController = toolkit.BaseController else: from ckan.lib.base import BaseController -if toolkit.check_ckan_version(max_version='2.8.99'): - from ckan.controllers.package import PackageController - from ckan.controllers.home import HomeController - read_endpoint = PackageController().read - index_endpoint = HomeController().index -else: - from ckan.views.home import index 
as index_endpoint - from ckan.views.dataset import read as read_endpoint - -from ckanext.dcat.utils import CONTENT_TYPES, parse_accept_header -from ckanext.dcat.processors import RDFProfileException - - -def _get_package_type(id): - """ - Given the id of a package this method will return the type of the - package, or 'dataset' if no type is currently set - """ - pkg = model.Package.get(id) - if pkg: - return pkg.type or u'dataset' - return None - - -def check_access_header(): - _format = None - - # Check Accept headers - accept_header = toolkit.request.headers.get('Accept', '') - if accept_header: - _format = parse_accept_header(accept_header) - return _format - class DCATController(BaseController): def read_catalog(self, _format=None): - - if not _format: - _format = check_access_header() - - if not _format: - return index_endpoint() - - _profiles = toolkit.request.params.get('profiles') - if _profiles: - _profiles = _profiles.split(',') - - data_dict = { - 'page': toolkit.request.params.get('page'), - 'modified_since': toolkit.request.params.get('modified_since'), - 'q': toolkit.request.params.get('q'), - 'fq': toolkit.request.params.get('fq'), - 'format': _format, - 'profiles': _profiles, - } - - toolkit.response.headers.update( - {'Content-type': CONTENT_TYPES[_format]}) - try: - return toolkit.get_action('dcat_catalog_show')({}, data_dict) - except (toolkit.ValidationError, RDFProfileException) as e: - toolkit.abort(409, str(e)) + return utils.read_catalog_page(_format) def read_dataset(self, _id, _format=None): - - if not _format: - _format = check_access_header() - - if not _format: - if toolkit.check_ckan_version(max_version='2.8.99'): - return read_endpoint(_id) - else: - return read_endpoint(_get_package_type(_id), _id) - - _profiles = toolkit.request.params.get('profiles') - if _profiles: - _profiles = _profiles.split(',') - - toolkit.response.headers.update( - {'Content-type': CONTENT_TYPES[_format]}) - - try: - result = toolkit.get_action('dcat_dataset_show')({}, {'id': _id, - 'format': _format, 'profiles': _profiles}) - except toolkit.ObjectNotFound: - toolkit.abort(404) - except (toolkit.ValidationError, RDFProfileException) as e: - toolkit.abort(409, str(e)) - - return result + return utils.read_dataset_page(_id, _format) def dcat_json(self): + datasets = utils.dcat_json_page() + content = json.dumps(datasets) - data_dict = { - 'page': toolkit.request.params.get('page'), - 'modified_since': toolkit.request.params.get('modified_since'), - } - - try: - datasets = toolkit.get_action('dcat_datasets_list')({}, - data_dict) - except toolkit.ValidationError, e: - toolkit.abort(409, str(e)) - - content = json.dumps(datasets) - - toolkit.response.headers['Content-Type'] = 'application/json' - toolkit.response.headers['Content-Length'] = len(content) - - return content - - + toolkit.response.headers['Content-Type'] = 'application/json' + toolkit.response.headers['Content-Length'] = len(content) + return content diff --git a/ckanext/dcat/exceptions.py b/ckanext/dcat/exceptions.py new file mode 100644 index 00000000..3d982bb1 --- /dev/null +++ b/ckanext/dcat/exceptions.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- + +class RDFParserException(Exception): + pass + + +class RDFProfileException(Exception): + pass diff --git a/ckanext/dcat/logic.py b/ckanext/dcat/logic.py index 6925911e..11971fc6 100644 --- a/ckanext/dcat/logic.py +++ b/ckanext/dcat/logic.py @@ -1,7 +1,7 @@ from __future__ import division import math -from pylons import config +from ckantoolkit import config from 
dateutil.parser import parse as dateutil_parse from ckan.plugins import toolkit diff --git a/ckanext/dcat/plugins.py b/ckanext/dcat/plugins/__init__.py similarity index 65% rename from ckanext/dcat/plugins.py rename to ckanext/dcat/plugins/__init__.py index cdd8c393..8b011018 100644 --- a/ckanext/dcat/plugins.py +++ b/ckanext/dcat/plugins/__init__.py @@ -1,3 +1,7 @@ +# -*- coding: utf-8 -*- + +import os + from pylons import config from ckan import plugins as p @@ -16,27 +20,42 @@ class DefaultTranslation(): ) from ckanext.dcat import utils -DEFAULT_CATALOG_ENDPOINT = '/catalog.{_format}' +if p.toolkit.check_ckan_version('2.9'): + from ckanext.dcat.plugins.flask_plugin import ( + MixinDCATPlugin, MixinDCATJSONInterface + ) +else: + from ckanext.dcat.plugins.pylons_plugin import ( + MixinDCATPlugin, MixinDCATJSONInterface + ) + + CUSTOM_ENDPOINT_CONFIG = 'ckanext.dcat.catalog_endpoint' -ENABLE_RDF_ENDPOINTS_CONFIG = 'ckanext.dcat.enable_rdf_endpoints' -ENABLE_CONTENT_NEGOTIATION_CONFIG = 'ckanext.dcat.enable_content_negotiation' TRANSLATE_KEYS_CONFIG = 'ckanext.dcat.translate_keys' +HERE = os.path.abspath(os.path.dirname(__file__)) +I18N_DIR = os.path.join(HERE, u"../i18n") -class DCATPlugin(p.SingletonPlugin, DefaultTranslation): + +class DCATPlugin(MixinDCATPlugin, p.SingletonPlugin, DefaultTranslation): p.implements(p.IConfigurer, inherit=True) p.implements(p.ITemplateHelpers, inherit=True) - p.implements(p.IRoutes, inherit=True) p.implements(p.IActions, inherit=True) p.implements(p.IAuthFunctions, inherit=True) p.implements(p.IPackageController, inherit=True) if p.toolkit.check_ckan_version(min_version='2.5.0'): p.implements(p.ITranslation, inherit=True) + # ITranslation + + def i18n_directory(self): + return I18N_DIR + # IConfigurer + def update_config(self, config): - p.toolkit.add_template_directory(config, 'templates') + p.toolkit.add_template_directory(config, '../templates') # Check catalog URI on startup to emit a warning if necessary utils.catalog_uri() @@ -54,41 +73,14 @@ def update_config(self, config): CUSTOM_ENDPOINT_CONFIG)) # ITemplateHelpers + def get_helpers(self): return { 'helper_available': utils.helper_available, } - # IRoutes - def before_map(self, _map): - - controller = 'ckanext.dcat.controllers:DCATController' - - if p.toolkit.asbool(config.get(ENABLE_RDF_ENDPOINTS_CONFIG, True)): - - _map.connect('dcat_catalog', - config.get('ckanext.dcat.catalog_endpoint', - DEFAULT_CATALOG_ENDPOINT), - controller=controller, action='read_catalog', - requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}) - - _map.connect('dcat_dataset', '/dataset/{_id}.{_format}', - controller=controller, action='read_dataset', - requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}) - - if p.toolkit.asbool(config.get(ENABLE_CONTENT_NEGOTIATION_CONFIG)): - - _map.connect('home', '/', controller=controller, - action='read_catalog') - - _map.connect('add dataset', '/dataset/new', controller='package', action='new') - _map.connect('dataset_read', '/dataset/{_id}', - controller=controller, action='read_dataset', - ckan_icon='sitemap') - - return _map - # IActions + def get_actions(self): return { 'dcat_dataset_show': dcat_dataset_show, @@ -97,6 +89,7 @@ def get_actions(self): } # IAuthFunctions + def get_auth_functions(self): return { 'dcat_dataset_show': dcat_auth, @@ -105,6 +98,7 @@ def get_auth_functions(self): } # IPackageController + def after_show(self, context, data_dict): # check if config is enabled to translate keys (default: True) @@ -130,28 +124,19 @@ def set_titles(object_dict): return 
data_dict -class DCATJSONInterface(p.SingletonPlugin): - - p.implements(p.IRoutes, inherit=True) +class DCATJSONInterface(MixinDCATJSONInterface, p.SingletonPlugin): p.implements(p.IActions) p.implements(p.IAuthFunctions, inherit=True) - # IRoutes - def after_map(self, map): - - controller = 'ckanext.dcat.controllers:DCATController' - route = config.get('ckanext.dcat.json_endpoint', '/dcat.json') - map.connect(route, controller=controller, action='dcat_json') - - return map - # IActions + def get_actions(self): return { 'dcat_datasets_list': dcat_datasets_list, } # IAuthFunctions + def get_auth_functions(self): return { 'dcat_datasets_list': dcat_auth, @@ -162,8 +147,8 @@ class StructuredDataPlugin(p.SingletonPlugin): p.implements(p.ITemplateHelpers, inherit=True) # ITemplateHelpers + def get_helpers(self): return { 'structured_data': utils.structured_data, } - diff --git a/ckanext/dcat/plugins/flask_plugin.py b/ckanext/dcat/plugins/flask_plugin.py new file mode 100644 index 00000000..68abe71b --- /dev/null +++ b/ckanext/dcat/plugins/flask_plugin.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- + +import ckan.plugins as p + +import ckanext.dcat.cli as cli +import ckanext.dcat.views as views + + +class MixinDCATPlugin(p.SingletonPlugin): + p.implements(p.IClick) + p.implements(p.IBlueprint) + + # IClick + + def get_commands(self): + return cli.get_commands() + + # IBlueprint + + def get_blueprint(self): + return [views.dcat] + + +class MixinDCATJSONInterface(p.SingletonPlugin): + p.implements(p.IBlueprint) + + # IBlueprint + + def get_blueprint(self): + return [views.dcat_json_interface] diff --git a/ckanext/dcat/plugins/pylons_plugin.py b/ckanext/dcat/plugins/pylons_plugin.py new file mode 100644 index 00000000..93848341 --- /dev/null +++ b/ckanext/dcat/plugins/pylons_plugin.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- + +from ckantoolkit import config +import ckan.plugins as p + +import ckanext.dcat.utils as utils + +class MixinDCATPlugin(p.SingletonPlugin): + p.implements(p.IRoutes, inherit=True) + + # IRoutes + + def before_map(self, _map): + + controller = 'ckanext.dcat.controllers:DCATController' + + if p.toolkit.asbool(config.get(utils.ENABLE_RDF_ENDPOINTS_CONFIG, True)): + + _map.connect('dcat_catalog', + config.get('ckanext.dcat.catalog_endpoint', + utils.DEFAULT_CATALOG_ENDPOINT), + controller=controller, action='read_catalog', + requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}) + + _map.connect('dcat_dataset', '/dataset/{_id}.{_format}', + controller=controller, action='read_dataset', + requirements={'_format': 'xml|rdf|n3|ttl|jsonld'}) + + if p.toolkit.asbool(config.get(utils.ENABLE_CONTENT_NEGOTIATION_CONFIG)): + + _map.connect('home', '/', controller=controller, + action='read_catalog') + + _map.connect('add dataset', '/dataset/new', controller='package', action='new') + _map.connect('dataset_read', '/dataset/{_id}', + controller=controller, action='read_dataset', + ckan_icon='sitemap') + + return _map + +class MixinDCATJSONInterface(p.SingletonPlugin): + p.implements(p.IRoutes, inherit=True) + + # IRoutes + + def after_map(self, map): + + controller = 'ckanext.dcat.controllers:DCATController' + route = config.get('ckanext.dcat.json_endpoint', '/dcat.json') + map.connect(route, controller=controller, action='dcat_json') + + return map diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index 96b132a7..5a966a8a 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -6,7 +6,7 @@ import json from pkg_resources import iter_entry_points -from 
pylons import config +from ckantoolkit import config import rdflib import rdflib.parser @@ -17,7 +17,7 @@ from ckanext.dcat.utils import catalog_uri, dataset_uri, url_to_rdflib_format, DCAT_EXPOSE_SUBCATALOGS from ckanext.dcat.profiles import DCAT, DCT, FOAF - +from ckanext.dcat.exceptions import RDFProfileException, RDFParserException HYDRA = Namespace('http://www.w3.org/ns/hydra/core#') DCAT = Namespace("http://www.w3.org/ns/dcat#") @@ -29,13 +29,6 @@ DEFAULT_RDF_PROFILES = ['euro_dcat_ap'] -class RDFParserException(Exception): - pass - - -class RDFProfileException(Exception): - pass - class RDFProcessor(object): diff --git a/ckanext/dcat/templates/home/index.html b/ckanext/dcat/templates/home/index.html index 0706b0b2..b72050fa 100644 --- a/ckanext/dcat/templates/home/index.html +++ b/ckanext/dcat/templates/home/index.html @@ -1,8 +1,10 @@ {% ckan_extends %} {% block links %} - {{ super() }} - - - - + {{ super() }} + {% with endpoint='dcat.read_catalog' if h.ckan_version() > '2.9' else 'dcat_catalog' %} + + + + + {% endwith %} {% endblock -%} diff --git a/ckanext/dcat/templates/package/read_base.html b/ckanext/dcat/templates/package/read_base.html index 3cd145e0..e1cf784e 100644 --- a/ckanext/dcat/templates/package/read_base.html +++ b/ckanext/dcat/templates/package/read_base.html @@ -1,10 +1,12 @@ {% ckan_extends %} {% block links %} - {{ super() }} - - - - + {{ super() }} + {% with endpoint='dcat.read_dataset' if h.ckan_version() > '2.9' else 'dcat_dataset' %} + + + + + {% endwith %} {% endblock -%} {% block body_extras %} {{ super() }} @@ -17,10 +19,10 @@ https://developers.google.com/search/docs/guides/intro-structured-data #} - {% if h.helper_available('structured_data') %} - - {% endif %} + {% if h.helper_available('structured_data') %} + + {% endif %} {% endblock %} {% endblock %} diff --git a/ckanext/dcat/templates/package/search.html b/ckanext/dcat/templates/package/search.html index 0706b0b2..b72050fa 100644 --- a/ckanext/dcat/templates/package/search.html +++ b/ckanext/dcat/templates/package/search.html @@ -1,8 +1,10 @@ {% ckan_extends %} {% block links %} - {{ super() }} - - - - + {{ super() }} + {% with endpoint='dcat.read_catalog' if h.ckan_version() > '2.9' else 'dcat_catalog' %} + + + + + {% endwith %} {% endblock -%} diff --git a/ckanext/dcat/utils.py b/ckanext/dcat/utils.py index be619cb1..297c2b61 100644 --- a/ckanext/dcat/utils.py +++ b/ckanext/dcat/utils.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + import logging import uuid import json @@ -15,6 +17,17 @@ class HelperError(Exception): from ckan import model import ckan.plugins.toolkit as toolkit +from ckanext.dcat.exceptions import RDFProfileException + +if toolkit.check_ckan_version(max_version='2.8.99'): + from ckan.controllers.package import PackageController + from ckan.controllers.home import HomeController + read_endpoint = PackageController().read + index_endpoint = HomeController().index +else: + from ckan.views.home import index as index_endpoint + from ckan.views.dataset import read as read_endpoint + _ = toolkit._ log = logging.getLogger(__name__) @@ -31,6 +44,21 @@ class HelperError(Exception): DCAT_CLEAN_TAGS = 'ckanext.dcat.clean_tags' +DEFAULT_CATALOG_ENDPOINT = '/catalog.{_format}' +ENABLE_RDF_ENDPOINTS_CONFIG = 'ckanext.dcat.enable_rdf_endpoints' +ENABLE_CONTENT_NEGOTIATION_CONFIG = 'ckanext.dcat.enable_content_negotiation' + + +def _get_package_type(id): + """ + Given the id of a package this method will return the type of the + package, or 'dataset' if no type is currently set + """ + pkg = 
model.Package.get(id) + if pkg: + return pkg.type or u'dataset' + return None + def field_labels(): ''' @@ -92,9 +120,9 @@ def structured_data(dataset_id, profiles=None, _format='jsonld'): data = toolkit.get_action('dcat_dataset_show')( {}, { - 'id': dataset_id, - 'profiles': profiles, - 'format': _format, + 'id': dataset_id, + 'profiles': profiles, + 'format': _format, } ) # parse result again to prevent UnicodeDecodeError and add formatting @@ -337,3 +365,119 @@ def parse_accept_header(accept_header=''): return accepted_media_types_wildcard[_type] return None + + +def generate_static_json(output): + data_dict = {'page': 0} + + output.write(u"[") + + while True: + try: + data_dict['page'] = data_dict['page'] + 1 + datasets = \ + toolkit.get_action('dcat_datasets_list')({}, + data_dict) + except toolkit.ValidationError as e: + log.exception(e) + break + + if not datasets: + break + + for dataset in datasets: + output.write(json.dumps(dataset)) + + output.write(u"]") + + +def check_access_header(): + _format = None + + # Check Accept headers + accept_header = toolkit.request.headers.get('Accept', '') + if accept_header: + _format = parse_accept_header(accept_header) + return _format + + +def dcat_json_page(): + data_dict = { + 'page': toolkit.request.params.get('page'), + 'modified_since': toolkit.request.params.get('modified_since'), + } + + try: + datasets = toolkit.get_action('dcat_datasets_list')({}, + data_dict) + except toolkit.ValidationError, e: + return toolkit.abort(409, str(e)) + + return datasets + + +def read_dataset_page(_id, _format): + if not _format: + _format = check_access_header() + + if not _format: + if toolkit.check_ckan_version(max_version='2.8.99'): + return read_endpoint(_id) + else: + return read_endpoint(_get_package_type(_id), _id) + + _profiles = toolkit.request.params.get('profiles') + if _profiles: + _profiles = _profiles.split(',') + + try: + response = toolkit.get_action('dcat_dataset_show')({}, {'id': _id, + 'format': _format, 'profiles': _profiles}) + except toolkit.ObjectNotFound: + toolkit.abort(404) + except (toolkit.ValidationError, RDFProfileException) as e: + toolkit.abort(409, str(e)) + + if toolkit.check_ckan_version(max_version='2.8.99'): + toolkit.response.headers.update({'Content-type': CONTENT_TYPES[_format]}) + else: + from flask import make_response + response = make_response(response) + response.headers['Content-type'] = CONTENT_TYPES[_format] + + return response + +def read_catalog_page(_format): + if not _format: + _format = check_access_header() + + if not _format: + return index_endpoint() + + _profiles = toolkit.request.params.get('profiles') + if _profiles: + _profiles = _profiles.split(',') + + data_dict = { + 'page': toolkit.request.params.get('page'), + 'modified_since': toolkit.request.params.get('modified_since'), + 'q': toolkit.request.params.get('q'), + 'fq': toolkit.request.params.get('fq'), + 'format': _format, + 'profiles': _profiles, + } + + try: + response = toolkit.get_action('dcat_catalog_show')({}, data_dict) + except (toolkit.ValidationError, RDFProfileException) as e: + toolkit.abort(409, str(e)) + + if toolkit.check_ckan_version(max_version='2.8.99'): + toolkit.response.headers.update( + {'Content-type': CONTENT_TYPES[_format]}) + else: + from flask import make_response + response = make_response(response) + response.headers['Content-type'] = CONTENT_TYPES[_format] + + return response diff --git a/ckanext/dcat/views.py b/ckanext/dcat/views.py new file mode 100644 index 00000000..3f76684b --- /dev/null +++ 
b/ckanext/dcat/views.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +from flask import Blueprint, jsonify, make_response + +from ckantoolkit import config + +import ckan.plugins.toolkit as toolkit +import ckanext.dcat.utils as utils + +dcat = Blueprint('dcat', __name__) + + +def read_catalog(_format=None): + return utils.read_catalog_page(_format) + + +def read_dataset(_id, _format): + return utils.read_dataset_page(_id, _format) + + +if toolkit.asbool(config.get(utils.ENABLE_RDF_ENDPOINTS_CONFIG, True)): + + # requirements={'_format': 'xml|rdf|n3|ttl|jsonld'} + dcat.add_url_rule(config.get('ckanext.dcat.catalog_endpoint', + utils.DEFAULT_CATALOG_ENDPOINT).replace( + '{_format}', '<_format>'), + view_func=read_catalog) + dcat.add_url_rule('/dataset/<_id>.<_format>', view_func=read_dataset) + +if toolkit.asbool(config.get(utils.ENABLE_CONTENT_NEGOTIATION_CONFIG)): + dcat.add_url_rule('/', view_func=read_catalog, endpoint="home.index") + +dcat_json_interface = Blueprint('dcat_json_interface', __name__) + + +def dcat_json(): + datasets = utils.dcat_json_page() + return jsonify(datasets) + + +dcat_json_interface.add_url_rule(config.get('ckanext.dcat.json_endpoint', + '/dcat.json'), + view_func=dcat_json) diff --git a/requirements.txt b/requirements.txt index 65cbc765..663e65dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ rdflib==4.2.1 rdflib-jsonld==0.4.0 -git+https://github.com/geomet/geomet.git +geomet>=0.2.0 ckantoolkit==0.0.3 From 6d7aa35d35abda82cff6eb09059cbf928c701034 Mon Sep 17 00:00:00 2001 From: Sergey Motornyuk Date: Mon, 6 Jan 2020 18:38:33 +0200 Subject: [PATCH 02/21] Futurize. 1st stage --- ckanext/dcat/harvesters/_json.py | 6 +++--- ckanext/dcat/harvesters/base.py | 6 +++--- ckanext/dcat/harvesters/rdf.py | 10 +++++----- ckanext/dcat/processors.py | 2 +- ckanext/dcat/tests/test_harvester.py | 2 +- ckanext/dcat/tests/test_json_harvester.py | 3 ++- ckanext/dcat/utils.py | 2 +- 7 files changed, 16 insertions(+), 15 deletions(-) diff --git a/ckanext/dcat/harvesters/_json.py b/ckanext/dcat/harvesters/_json.py index b77f4f7b..487eaad4 100644 --- a/ckanext/dcat/harvesters/_json.py +++ b/ckanext/dcat/harvesters/_json.py @@ -87,7 +87,7 @@ def gather_stage(self, harvest_job): try: content, content_type = \ self._get_content_and_type(url, harvest_job, page) - except requests.exceptions.HTTPError, error: + except requests.exceptions.HTTPError as error: if error.response.status_code == 404: if page > 1: # Server returned a 404 after the first page, no more @@ -144,7 +144,7 @@ def gather_stage(self, harvest_job): # Empty document, no more ids break - except ValueError, e: + except ValueError as e: msg = 'Error parsing file: {0}'.format(str(e)) self._save_gather_error(msg, harvest_job) return None @@ -283,7 +283,7 @@ def import_stage(self, harvest_object): package_id = p.toolkit.get_action(action)(context, package_dict) log.info('%s dataset with id %s', message_status, package_id) - except Exception, e: + except Exception as e: dataset = json.loads(harvest_object.content) dataset_name = dataset.get('name', '') diff --git a/ckanext/dcat/harvesters/base.py b/ckanext/dcat/harvesters/base.py index a33a4127..aadbc0cd 100644 --- a/ckanext/dcat/harvesters/base.py +++ b/ckanext/dcat/harvesters/base.py @@ -97,7 +97,7 @@ def _get_content_and_type(self, url, harvest_job, page=1, return content, content_type - except requests.exceptions.HTTPError, error: + except requests.exceptions.HTTPError as error: if page > 1 and error.response.status_code == 404: # We want to catch these ones 
later on raise @@ -106,12 +106,12 @@ def _get_content_and_type(self, url, harvest_job, page=1, % (url, error.response.status_code, error.response.reason) self._save_gather_error(msg, harvest_job) return None, None - except requests.exceptions.ConnectionError, error: + except requests.exceptions.ConnectionError as error: msg = '''Could not get content from %s because a connection error occurred. %s''' % (url, error) self._save_gather_error(msg, harvest_job) return None, None - except requests.exceptions.Timeout, error: + except requests.exceptions.Timeout as error: msg = 'Could not get content from %s because the connection timed'\ ' out.' % url self._save_gather_error(msg, harvest_job) diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index ddb215b3..6c590490 100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -197,7 +197,7 @@ def gather_stage(self, harvest_job): try: parser.parse(content, _format=rdf_format) - except RDFParserException, e: + except RDFParserException as e: self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job) return [] @@ -235,7 +235,7 @@ def gather_stage(self, harvest_job): obj.save() object_ids.append(obj.id) - except Exception, e: + except Exception as e: self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job) return [] @@ -336,7 +336,7 @@ def import_stage(self, harvest_object): else: log.info('Ignoring dataset %s' % existing_dataset['name']) return 'unchanged' - except p.toolkit.ValidationError, e: + except p.toolkit.ValidationError as e: self._save_object_error('Update validation Error: %s' % str(e.error_summary), harvest_object, 'Import') return False @@ -381,7 +381,7 @@ def import_stage(self, harvest_object): else: log.info('Ignoring dataset %s' % name) return 'unchanged' - except p.toolkit.ValidationError, e: + except p.toolkit.ValidationError as e: self._save_object_error('Create validation Error: %s' % str(e.error_summary), harvest_object, 'Import') return False @@ -394,7 +394,7 @@ def import_stage(self, harvest_object): log.info('Created dataset %s' % dataset['name']) - except Exception, e: + except Exception as e: self._save_object_error('Error importing dataset %s: %r / %s' % (dataset.get('name', ''), e, traceback.format_exc()), harvest_object, 'Import') return False diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index 5a966a8a..141860c4 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -147,7 +147,7 @@ def parse(self, data, _format=None): # exceptions are not cached, add them here. # PluginException indicates that an unknown format was passed. 
except (SyntaxError, xml.sax.SAXParseException, - rdflib.plugin.PluginException, TypeError), e: + rdflib.plugin.PluginException, TypeError) as e: raise RDFParserException(e) diff --git a/ckanext/dcat/tests/test_harvester.py b/ckanext/dcat/tests/test_harvester.py index 67b41f26..fa686c56 100644 --- a/ckanext/dcat/tests/test_harvester.py +++ b/ckanext/dcat/tests/test_harvester.py @@ -570,7 +570,7 @@ def _run_jobs(self, harvest_source_id=None): try: h.call_action('harvest_jobs_run', {}, source_id=harvest_source_id) - except Exception, e: + except Exception as e: if (str(e) == 'There are no new harvesting jobs'): pass diff --git a/ckanext/dcat/tests/test_json_harvester.py b/ckanext/dcat/tests/test_json_harvester.py index 0ee1f1a5..a172be0d 100644 --- a/ckanext/dcat/tests/test_json_harvester.py +++ b/ckanext/dcat/tests/test_json_harvester.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import httpretty from mock import call, patch, Mock @@ -9,7 +10,7 @@ import ckan.tests.factories as factories from ckanext.dcat.harvesters._json import copy_across_resource_ids, DCATJSONHarvester -from test_harvester import FunctionalHarvestTest +from .test_harvester import FunctionalHarvestTest eq_ = nose.tools.eq_ diff --git a/ckanext/dcat/utils.py b/ckanext/dcat/utils.py index 297c2b61..a6101ac1 100644 --- a/ckanext/dcat/utils.py +++ b/ckanext/dcat/utils.py @@ -410,7 +410,7 @@ def dcat_json_page(): try: datasets = toolkit.get_action('dcat_datasets_list')({}, data_dict) - except toolkit.ValidationError, e: + except toolkit.ValidationError as e: return toolkit.abort(409, str(e)) return datasets From c6d9f0bebc6ba887d334ca589146158400e43984 Mon Sep 17 00:00:00 2001 From: Sergey Motornyuk Date: Mon, 6 Jan 2020 18:40:29 +0200 Subject: [PATCH 03/21] Futurize. 2nd stage --- ckanext/dcat/converters.py | 1 + ckanext/dcat/harvesters/_json.py | 7 ++- ckanext/dcat/harvesters/rdf.py | 8 ++- ckanext/dcat/logic.py | 2 +- ckanext/dcat/plugins/__init__.py | 5 +- ckanext/dcat/processors.py | 4 +- ckanext/dcat/profiles.py | 57 ++++++++++--------- ckanext/dcat/tests/test_base_parser.py | 2 + ckanext/dcat/tests/test_base_profile.py | 14 +++-- ckanext/dcat/tests/test_controllers.py | 12 ++-- ckanext/dcat/tests/test_converters.py | 1 + .../tests/test_euro_dcatap_profile_parse.py | 6 +- .../test_euro_dcatap_profile_serialize.py | 22 +++---- ckanext/dcat/tests/test_harvester.py | 7 ++- ckanext/dcat/tests/test_json_harvester.py | 13 +++-- ckanext/dcat/tests/test_logic.py | 22 +++---- .../tests/test_schemaorg_profile_serialize.py | 9 +-- ckanext/dcat/tests/test_utils.py | 1 + ckanext/dcat/utils.py | 7 ++- requirements.txt | 1 + 20 files changed, 117 insertions(+), 84 deletions(-) diff --git a/ckanext/dcat/converters.py b/ckanext/dcat/converters.py index 541a07e8..ddce1723 100644 --- a/ckanext/dcat/converters.py +++ b/ckanext/dcat/converters.py @@ -1,3 +1,4 @@ +from past.builtins import basestring import logging log = logging.getLogger(__name__) diff --git a/ckanext/dcat/harvesters/_json.py b/ckanext/dcat/harvesters/_json.py index 487eaad4..a08fc393 100644 --- a/ckanext/dcat/harvesters/_json.py +++ b/ckanext/dcat/harvesters/_json.py @@ -1,3 +1,4 @@ +from builtins import str import json import logging from hashlib import sha1 @@ -74,7 +75,7 @@ def gather_stage(self, harvest_job): for guid, package_id in query: guid_to_package_id[guid] = package_id - guids_in_db = guid_to_package_id.keys() + guids_in_db = list(guid_to_package_id.keys()) guids_in_source = [] # Get file contents @@ -259,8 +260,8 @@ def import_stage(self, 
harvest_object): context['schema'] = package_schema # We need to explicitly provide a package ID - package_dict['id'] = unicode(uuid.uuid4()) - package_schema['id'] = [unicode] + package_dict['id'] = str(uuid.uuid4()) + package_schema['id'] = [str] # Save reference to the package on the object harvest_object.package_id = package_dict['id'] diff --git a/ckanext/dcat/harvesters/rdf.py b/ckanext/dcat/harvesters/rdf.py index 6c590490..046bc294 100644 --- a/ckanext/dcat/harvesters/rdf.py +++ b/ckanext/dcat/harvesters/rdf.py @@ -1,3 +1,5 @@ +from builtins import str +from past.builtins import basestring import json import uuid import logging @@ -107,7 +109,7 @@ def _mark_datasets_for_deletion(self, guids_in_source, harvest_job): for guid, package_id in query: guid_to_package_id[guid] = package_id - guids_in_db = guid_to_package_id.keys() + guids_in_db = list(guid_to_package_id.keys()) # Get objects/datasets to delete (ie in the DB but not in the source) guids_to_delete = set(guids_in_db) - set(guids_in_source) @@ -356,8 +358,8 @@ def import_stage(self, harvest_object): context['schema'] = package_schema # We need to explicitly provide a package ID - dataset['id'] = unicode(uuid.uuid4()) - package_schema['id'] = [unicode] + dataset['id'] = str(uuid.uuid4()) + package_schema['id'] = [str] harvester_tmp_dict = {} diff --git a/ckanext/dcat/logic.py b/ckanext/dcat/logic.py index 11971fc6..b44fe191 100644 --- a/ckanext/dcat/logic.py +++ b/ckanext/dcat/logic.py @@ -150,7 +150,7 @@ def _page_url(page): base_url = '%s%s' % ( base_url, toolkit.request.path) - params = [p for p in toolkit.request.params.iteritems() + params = [p for p in toolkit.request.params.items() if p[0] != 'page' and p[0] in ('modified_since', 'profiles', 'q', 'fq')] if params: qs = '&'.join(['{0}={1}'.format(p[0], p[1]) for p in params]) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 8b011018..ecf5c065 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from builtins import object import os from pylons import config @@ -8,7 +9,7 @@ try: from ckan.lib.plugins import DefaultTranslation except ImportError: - class DefaultTranslation(): + class DefaultTranslation(object): pass @@ -109,7 +110,7 @@ def after_show(self, context, data_dict): field_labels = utils.field_labels() def set_titles(object_dict): - for key, value in object_dict.iteritems(): + for key, value in object_dict.items(): if key in field_labels: object_dict[field_labels[key]] = object_dict[key] del object_dict[key] diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index 141860c4..cc514107 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -1,5 +1,7 @@ from __future__ import print_function +from builtins import str +from builtins import object import sys import argparse import xml @@ -116,7 +118,7 @@ def next_page(self): ''' for pagination_node in self.g.subjects(RDF.type, HYDRA.PagedCollection): for o in self.g.objects(pagination_node, HYDRA.nextPage): - return unicode(o) + return str(o) return None diff --git a/ckanext/dcat/profiles.py b/ckanext/dcat/profiles.py index 71c227a4..f84feee1 100644 --- a/ckanext/dcat/profiles.py +++ b/ckanext/dcat/profiles.py @@ -1,7 +1,12 @@ +from future import standard_library +standard_library.install_aliases() +from builtins import str +from past.builtins import basestring +from builtins import object import datetime import json -from urllib import quote +from urllib.parse import 
quote from dateutil.parser import parse as parse_date @@ -181,12 +186,12 @@ def _object_value(self, subject, predicate): for o in self.g.objects(subject, predicate): if isinstance(o, Literal): if o.language and o.language == default_lang: - return unicode(o) + return str(o) # Use first object as fallback if no object with the default language is available elif fallback == '': - fallback = unicode(o) + fallback = str(o) else: - return unicode(o) + return str(o) return fallback def _object_value_int(self, subject, predicate): @@ -215,7 +220,7 @@ def _object_value_list(self, subject, predicate): If no values found, returns an empty string ''' - return [unicode(o) for o in self.g.objects(subject, predicate)] + return [str(o) for o in self.g.objects(subject, predicate)] def _time_interval(self, subject, predicate): ''' @@ -296,7 +301,7 @@ def _publisher(self, subject, predicate): for agent in self.g.objects(subject, predicate): - publisher['uri'] = (unicode(agent) if isinstance(agent, + publisher['uri'] = (str(agent) if isinstance(agent, rdflib.term.URIRef) else '') publisher['name'] = self._object_value(agent, FOAF.name) @@ -323,7 +328,7 @@ def _contact_details(self, subject, predicate): for agent in self.g.objects(subject, predicate): - contact['uri'] = (unicode(agent) if isinstance(agent, + contact['uri'] = (str(agent) if isinstance(agent, rdflib.term.URIRef) else '') contact['name'] = self._object_value(agent, VCARD.fn) @@ -358,29 +363,29 @@ def _spatial(self, subject, predicate): for spatial in self.g.objects(subject, predicate): if isinstance(spatial, URIRef): - uri = unicode(spatial) + uri = str(spatial) if isinstance(spatial, Literal): - text = unicode(spatial) + text = str(spatial) if (spatial, RDF.type, DCT.Location) in self.g: for geometry in self.g.objects(spatial, LOCN.geometry): if (geometry.datatype == URIRef(GEOJSON_IMT) or not geometry.datatype): try: - json.loads(unicode(geometry)) - geom = unicode(geometry) + json.loads(str(geometry)) + geom = str(geometry) except (ValueError, TypeError): pass if not geom and geometry.datatype == GSP.wktLiteral: try: - geom = json.dumps(wkt.loads(unicode(geometry))) + geom = json.dumps(wkt.loads(str(geometry))) except (ValueError, TypeError): pass for label in self.g.objects(spatial, SKOS.prefLabel): - text = unicode(label) + text = str(label) for label in self.g.objects(spatial, RDFS.label): - text = unicode(label) + text = str(label) return { 'uri': uri, @@ -403,7 +408,7 @@ def _license(self, dataset_ref): else: license_uri2id = {} license_title2id = {} - for license_id, license in LicenseRegister().items(): + for license_id, license in list(LicenseRegister().items()): license_uri2id[license.url] = license_id license_title2id[license.title] = license_id self._licenceregister_cache = license_uri2id, license_title2id @@ -466,18 +471,18 @@ def _distribution_format(self, distribution, normalize_ckan_format=True): _format = self._object(distribution, DCT['format']) if isinstance(_format, Literal): if not imt and '/' in _format: - imt = unicode(_format) + imt = str(_format) else: - label = unicode(_format) + label = str(_format) elif isinstance(_format, (BNode, URIRef)): if self._object(_format, RDF.type) == DCT.IMT: if not imt: - imt = unicode(self.g.value(_format, default=None)) - label = unicode(self.g.label(_format, default=None)) + imt = str(self.g.value(_format, default=None)) + label = str(self.g.label(_format, default=None)) elif isinstance(_format, URIRef): # If the URIRef does not reference a BNode, it could reference an IANA type. 
# Otherwise, use it as label. - format_uri = unicode(_format) + format_uri = str(_format) if 'iana.org/assignments/media-types' in format_uri and not imt: imt = format_uri else: @@ -612,7 +617,7 @@ def _add_list_triple(self, subject, predicate, value, _type=Literal): try: # JSON list items = json.loads(value) - if isinstance(items, ((int, long, float, complex))): + if isinstance(items, ((int, int, float, complex))): items = [items] except ValueError: if ',' in value: @@ -684,7 +689,7 @@ def _without_mailto(self, mail_addr): Ensures that the mail address string has no mailto: prefix. ''' if mail_addr: - return unicode(mail_addr).replace(PREFIX_MAILTO, u'') + return str(mail_addr).replace(PREFIX_MAILTO, u'') else: return mail_addr @@ -895,7 +900,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): 'value': spatial.get(key)}) # Dataset URI (explicitly show the missing ones) - dataset_uri = (unicode(dataset_ref) + dataset_uri = (str(dataset_ref) if isinstance(dataset_ref, rdflib.term.URIRef) else '') dataset_dict['extras'].append({'key': 'uri', 'value': dataset_uri}) @@ -975,7 +980,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): resource_dict['hash'] = checksum_value # Distribution URI (explicitly show the missing ones) - resource_dict['uri'] = (unicode(distribution) + resource_dict['uri'] = (str(distribution) if isinstance(distribution, rdflib.term.URIRef) else '') @@ -1001,7 +1006,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): g = self.g - for prefix, namespace in namespaces.iteritems(): + for prefix, namespace in namespaces.items(): g.bind(prefix, namespace) g.add((dataset_ref, RDF.type, DCAT.Dataset)) @@ -1258,7 +1263,7 @@ def graph_from_catalog(self, catalog_dict, catalog_ref): g = self.g - for prefix, namespace in namespaces.iteritems(): + for prefix, namespace in namespaces.items(): g.bind(prefix, namespace) g.add((catalog_ref, RDF.type, DCAT.Catalog)) diff --git a/ckanext/dcat/tests/test_base_parser.py b/ckanext/dcat/tests/test_base_parser.py index 5386dfe4..0bf21487 100644 --- a/ckanext/dcat/tests/test_base_parser.py +++ b/ckanext/dcat/tests/test_base_parser.py @@ -1,3 +1,5 @@ +from builtins import str +from builtins import object import nose from ckantoolkit import config diff --git a/ckanext/dcat/tests/test_base_profile.py b/ckanext/dcat/tests/test_base_profile.py index 0266c0a8..916967b7 100644 --- a/ckanext/dcat/tests/test_base_profile.py +++ b/ckanext/dcat/tests/test_base_profile.py @@ -1,3 +1,5 @@ +from builtins import str +from builtins import object import nose from rdflib import Graph, URIRef, Literal @@ -103,7 +105,7 @@ def test_object_value(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, unicode) + assert isinstance(value, str) eq_(value, 'Test Dataset 1') def test_object_value_not_found(self): @@ -127,7 +129,7 @@ def test_object_value_default_lang(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, unicode) + assert isinstance(value, str) eq_(value, 'Test Datensatz 1') @helpers.change_config('ckan.locale_default', 'fr') @@ -140,7 +142,7 @@ def test_object_value_default_lang_not_in_graph(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, unicode) + assert isinstance(value, str) # FR is not in graph, so either node may be used assert value.startswith('Test D') assert value.endswith(' 1') @@ -156,7 +158,7 @@ def test_object_value_default_lang_fallback(self): value = 
p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, unicode) + assert isinstance(value, str) # without config parameter, EN is used as default eq_(value, 'Test Dataset 1 (EN)') @@ -166,7 +168,7 @@ def test_object_value_default_lang_missing_lang_param(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, unicode) + assert isinstance(value, str) eq_(value, 'Test Dataset 1') def test_object_int(self): @@ -234,7 +236,7 @@ def test_object_list(self): DCAT.keyword) assert isinstance(value, list) - assert isinstance(value[0], unicode) + assert isinstance(value[0], str) eq_(len(value), 2) eq_(sorted(value), ['moon', 'space']) diff --git a/ckanext/dcat/tests/test_controllers.py b/ckanext/dcat/tests/test_controllers.py index 34eac667..06fa11a1 100644 --- a/ckanext/dcat/tests/test_controllers.py +++ b/ckanext/dcat/tests/test_controllers.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +from builtins import str +from builtins import range import time import nose @@ -35,7 +37,7 @@ def teardown(self): def _object_value(self, graph, subject, predicate): objects = [o for o in graph.objects(subject, predicate)] - return unicode(objects[0]) if objects else None + return str(objects[0]) if objects else None def test_dataset_default(self): @@ -254,7 +256,7 @@ def test_dataset_form_is_rendered(self): def test_catalog_default(self): - for i in xrange(4): + for i in range(4): factories.Dataset() url = url_for('dcat_catalog', _format='rdf') @@ -278,7 +280,7 @@ def test_catalog_default(self): def test_catalog_ttl(self): - for i in xrange(4): + for i in range(4): factories.Dataset() url = url_for('dcat_catalog', _format='ttl') @@ -390,7 +392,7 @@ def test_catalog_fq_filter(self): @helpers.change_config('ckanext.dcat.datasets_per_page', 10) def test_catalog_pagination(self): - for i in xrange(12): + for i in range(12): factories.Dataset() app = self._get_test_app() @@ -424,7 +426,7 @@ def test_catalog_pagination(self): @helpers.change_config('ckanext.dcat.datasets_per_page', 10) def test_catalog_pagination_parameters(self): - for i in xrange(12): + for i in range(12): factories.Dataset() app = self._get_test_app() diff --git a/ckanext/dcat/tests/test_converters.py b/ckanext/dcat/tests/test_converters.py index c3f692e9..dde782c8 100644 --- a/ckanext/dcat/tests/test_converters.py +++ b/ckanext/dcat/tests/test_converters.py @@ -1,3 +1,4 @@ +from builtins import object import os import json import difflib diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py index 27da63f9..4bc123e0 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_parse.py @@ -1,3 +1,5 @@ +from builtins import str +from builtins import object import os import json @@ -754,7 +756,7 @@ def test_parse_subcatalog(self): datasets = dict([(d['title'], d) for d in p.datasets()]) for subdataset, subcatalog in subdatasets: - title = unicode(list(p.g.objects(subdataset, DCT.title))[0]) + title = str(list(p.g.objects(subdataset, DCT.title))[0]) dataset = datasets[title] has_subcat = False for ex in dataset['extras']: @@ -762,7 +764,7 @@ def test_parse_subcatalog(self): exkey = ex['key'] if exkey == 'source_catalog_homepage': has_subcat = True - eq_(exval, unicode(subcatalog)) + eq_(exval, str(subcatalog)) # check if we had subcatalog in extras assert_true(has_subcat) diff --git a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py 
b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py index ed91f6f7..c3be98bd 100644 --- a/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py +++ b/ckanext/dcat/tests/test_euro_dcatap_profile_serialize.py @@ -1,3 +1,5 @@ +from builtins import str +from builtins import object import json import nose @@ -129,7 +131,7 @@ def test_graph_from_dataset(self): dataset_ref = s.graph_from_dataset(dataset) - eq_(unicode(dataset_ref), utils.dataset_uri(dataset)) + eq_(str(dataset_ref), utils.dataset_uri(dataset)) # Basic fields assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset) @@ -266,7 +268,7 @@ def test_contact_details_extras(self): contact_details = self._triple(g, dataset_ref, DCAT.contactPoint, None)[2] assert contact_details - eq_(unicode(contact_details), extras['contact_uri']) + eq_(str(contact_details), extras['contact_uri']) assert self._triple(g, contact_details, VCARD.fn, extras['contact_name']) assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + extras['contact_email'])) @@ -358,7 +360,7 @@ def test_publisher_extras(self): publisher = self._triple(g, dataset_ref, DCT.publisher, None)[2] assert publisher - eq_(unicode(publisher), extras['publisher_uri']) + eq_(str(publisher), extras['publisher_uri']) assert self._triple(g, publisher, RDF.type, FOAF.Organization) assert self._triple(g, publisher, FOAF.name, extras['publisher_name']) @@ -453,7 +455,7 @@ def test_spatial(self): spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] assert spatial - eq_(unicode(spatial), extras['spatial_uri']) + eq_(str(spatial), extras['spatial_uri']) assert self._triple(g, spatial, RDF.type, DCT.Location) assert self._triple(g, spatial, SKOS.prefLabel, extras['spatial_text']) @@ -597,7 +599,7 @@ def test_distribution_fields(self): # URI distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] - eq_(unicode(distribution), utils.resource_uri(resource)) + eq_(str(distribution), utils.resource_uri(resource)) # Basic fields assert self._triple(g, distribution, RDF.type, DCAT.Distribution) @@ -1037,7 +1039,7 @@ def test_graph_from_catalog(self): catalog = s.graph_from_catalog() - eq_(unicode(catalog), utils.catalog_uri()) + eq_(str(catalog), utils.catalog_uri()) # Basic fields assert self._triple(g, catalog, RDF.type, DCAT.Catalog) @@ -1059,7 +1061,7 @@ def test_graph_from_catalog_dict(self): catalog = s.graph_from_catalog(catalog_dict) - eq_(unicode(catalog), utils.catalog_uri()) + eq_(str(catalog), utils.catalog_uri()) # Basic fields assert self._triple(g, catalog, RDF.type, DCAT.Catalog) @@ -1082,7 +1084,7 @@ def test_graph_from_catalog_dict_language_uri_ref(self): catalog = s.graph_from_catalog(catalog_dict) - eq_(unicode(catalog), utils.catalog_uri()) + eq_(str(catalog), utils.catalog_uri()) # language field assert self._triple(g, catalog, DCT.language, URIRef(catalog_dict['language'])) @@ -1096,7 +1098,7 @@ def test_graph_from_catalog_modified_date(self): catalog = s.graph_from_catalog() - eq_(unicode(catalog), utils.catalog_uri()) + eq_(str(catalog), utils.catalog_uri()) assert self._triple(g, catalog, DCT.modified, dataset['metadata_modified'], XSD.dateTime) @@ -1152,4 +1154,4 @@ def test_subcatalog(self): dataset_ref = dataset_ref[0] dataset_title = list(g.objects(dataset_ref, DCT.title)) assert_true(len(dataset_title) == 1) - assert_true(unicode(dataset_title[0]) == dataset['title']) + assert_true(str(dataset_title[0]) == dataset['title']) diff --git a/ckanext/dcat/tests/test_harvester.py b/ckanext/dcat/tests/test_harvester.py index 
fa686c56..9b0b15ec 100644 --- a/ckanext/dcat/tests/test_harvester.py +++ b/ckanext/dcat/tests/test_harvester.py @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- +from builtins import str +from builtins import range +from builtins import object from collections import defaultdict import nose @@ -576,7 +579,7 @@ def _run_jobs(self, harvest_source_id=None): def _gather_queue(self, num_jobs=1): - for job in xrange(num_jobs): + for job in range(num_jobs): # Pop one item off the queue (the job id) and run the callback reply = self.gather_consumer.basic_get( queue='ckan.harvest.gather.test') @@ -590,7 +593,7 @@ def _gather_queue(self, num_jobs=1): def _fetch_queue(self, num_objects=1): - for _object in xrange(num_objects): + for _object in range(num_objects): # Pop item from the fetch queues (object ids) and run the callback, # one for each object created reply = self.fetch_consumer.basic_get( diff --git a/ckanext/dcat/tests/test_json_harvester.py b/ckanext/dcat/tests/test_json_harvester.py index a172be0d..190e1bd0 100644 --- a/ckanext/dcat/tests/test_json_harvester.py +++ b/ckanext/dcat/tests/test_json_harvester.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from builtins import object import httpretty from mock import call, patch, Mock @@ -213,7 +214,7 @@ def test_harvest_does_not_create_with_invalid_tags(self): exp_num_datasets=0) -class TestCopyAcrossResourceIds: +class TestCopyAcrossResourceIds(object): def test_copied_because_same_uri(self): harvested_dataset = {'resources': [ {'uri': 'http://abc', 'url': 'http://abc'}]} @@ -274,24 +275,24 @@ def test_not_copied_because_completely_different(self): eq_(harvested_dataset['resources'][0].get('id'), None) -class TestImportStage: +class TestImportStage(object): @classmethod def setup_class(cls): h.reset_db() - class MockHarvestObject: + class MockHarvestObject(object): guid = 'test_guid' content = TestDCATJSONHarvestFunctional.json_content_invalid_tags - class MockStatus: + class MockStatus(object): key = 'status' value = 'new' extras = [MockStatus()] package = None - class MockSource: + class MockSource(object): id = 'test_id' source = MockSource() @@ -299,7 +300,7 @@ class MockSource: def add(self): pass - class MockSourceDataset: + class MockSourceDataset(object): def __init__(self, owner_org=None): self.owner_org = owner_org['id'] diff --git a/ckanext/dcat/tests/test_logic.py b/ckanext/dcat/tests/test_logic.py index 81f18cd3..b6f66eed 100644 --- a/ckanext/dcat/tests/test_logic.py +++ b/ckanext/dcat/tests/test_logic.py @@ -1,3 +1,5 @@ +from builtins import range +from builtins import object import nose import mock @@ -32,7 +34,7 @@ def test_pagination(self, mock_request): # No page defined (defaults to 1) query = { 'count': 12, - 'results': [x for x in xrange(10)], + 'results': [x for x in range(10)], } data_dict = { 'page': None @@ -52,7 +54,7 @@ def test_pagination(self, mock_request): # Page 1 query = { 'count': 12, - 'results': [x for x in xrange(10)], + 'results': [x for x in range(10)], } data_dict = { 'page': 1 @@ -72,7 +74,7 @@ def test_pagination(self, mock_request): # Page 2 query = { 'count': 12, - 'results': [x for x in xrange(2)], + 'results': [x for x in range(2)], } data_dict = { 'page': 2 @@ -92,7 +94,7 @@ def test_pagination(self, mock_request): # Page 3 query = { 'count': 12, - 'results': [x for x in xrange(2)], + 'results': [x for x in range(2)], } data_dict = { 'page': 3 @@ -121,7 +123,7 @@ def test_pagination_less_results_than_page_size(self, mock_request): # No page defined (defaults to 1) query = { 'count': 12, - 
'results': [x for x in xrange(12)], + 'results': [x for x in range(12)], } data_dict = { 'page': None @@ -150,7 +152,7 @@ def test_pagination_same_results_than_page_size(self, mock_request): # No page defined (defaults to 1) query = { 'count': 10, - 'results': [x for x in xrange(10)], + 'results': [x for x in range(10)], } data_dict = { 'page': None @@ -179,7 +181,7 @@ def test_pagination_keeps_only_supported_params(self, mock_request): # No page defined (defaults to 1) query = { 'count': 12, - 'results': [x for x in xrange(10)], + 'results': [x for x in range(10)], } data_dict = { 'page': None @@ -208,7 +210,7 @@ def test_pagination_without_site_url(self, mock_request): # No page defined (defaults to 1) query = { 'count': 12, - 'results': [x for x in xrange(10)], + 'results': [x for x in range(10)], } data_dict = { 'page': None @@ -241,7 +243,7 @@ def test_pagination_no_results_empty_dict(self): def test_pagination_wrong_page(self): query = { 'count': 10, - 'results': [x for x in xrange(10)], + 'results': [x for x in range(10)], } data_dict = { 'page': 'a' @@ -253,7 +255,7 @@ def test_pagination_wrong_page(self): def test_pagination_wrong_page_number(self): query = { 'count': 10, - 'results': [x for x in xrange(10)], + 'results': [x for x in range(10)], } data_dict = { 'page': '-1' diff --git a/ckanext/dcat/tests/test_schemaorg_profile_serialize.py b/ckanext/dcat/tests/test_schemaorg_profile_serialize.py index fb986db9..ba5f0635 100644 --- a/ckanext/dcat/tests/test_schemaorg_profile_serialize.py +++ b/ckanext/dcat/tests/test_schemaorg_profile_serialize.py @@ -1,3 +1,4 @@ +from builtins import str import json import nose @@ -62,7 +63,7 @@ def test_graph_from_dataset(self): dataset_ref = s.graph_from_dataset(dataset) - eq_(unicode(dataset_ref), utils.dataset_uri(dataset)) + eq_(str(dataset_ref), utils.dataset_uri(dataset)) # Basic fields assert self._triple(g, dataset_ref, RDF.type, SCHEMA.Dataset) @@ -121,7 +122,7 @@ def test_publisher_extras(self): publisher = self._triple(g, dataset_ref, SCHEMA.publisher, None)[2] assert publisher - eq_(unicode(publisher), extras['publisher_uri']) + eq_(str(publisher), extras['publisher_uri']) assert self._triple(g, publisher, RDF.type, SCHEMA.Organization) assert self._triple(g, publisher, SCHEMA.name, extras['publisher_name']) @@ -266,7 +267,7 @@ def test_spatial(self): spatial = self._triple(g, dataset_ref, SCHEMA.spatialCoverage, None)[2] assert spatial - eq_(unicode(spatial), extras['spatial_uri']) + eq_(str(spatial), extras['spatial_uri']) assert self._triple(g, spatial, RDF.type, SCHEMA.Place) assert self._triple(g, spatial, SCHEMA.description, extras['spatial_text']) geo = self._triple(g, spatial, SCHEMA.geo, None)[2] @@ -349,7 +350,7 @@ def test_distribution_fields(self): # URI distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] - eq_(unicode(distribution), utils.resource_uri(resource)) + eq_(str(distribution), utils.resource_uri(resource)) # Basic fields assert self._triple(g, distribution, RDF.type, SCHEMA.DataDownload) diff --git a/ckanext/dcat/tests/test_utils.py b/ckanext/dcat/tests/test_utils.py index 27b8f7f0..2c920806 100644 --- a/ckanext/dcat/tests/test_utils.py +++ b/ckanext/dcat/tests/test_utils.py @@ -1,3 +1,4 @@ +from builtins import object import nose from ckanext.dcat.utils import parse_accept_header diff --git a/ckanext/dcat/utils.py b/ckanext/dcat/utils.py index a6101ac1..c5a20351 100644 --- a/ckanext/dcat/utils.py +++ b/ckanext/dcat/utils.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from builtins import str 
import logging import uuid import json @@ -333,10 +334,10 @@ def parse_accept_header(accept_header=''): accepted_media_types = dict((value, key) for key, value - in CONTENT_TYPES.iteritems()) + in CONTENT_TYPES.items()) accepted_media_types_wildcard = {} - for media_type, _format in accepted_media_types.iteritems(): + for media_type, _format in accepted_media_types.items(): _type = media_type.split('/')[0] if _type not in accepted_media_types_wildcard: accepted_media_types_wildcard[_type] = _format @@ -349,7 +350,7 @@ def parse_accept_header(accept_header=''): qscore = m.groups(0)[2] or 1.0 acceptable[key] = float(qscore) - for media_type in sorted(acceptable.iteritems(), + for media_type in sorted(iter(acceptable.items()), key=operator.itemgetter(1), reverse=True): diff --git a/requirements.txt b/requirements.txt index 663e65dc..b376bb83 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ rdflib==4.2.1 rdflib-jsonld==0.4.0 geomet>=0.2.0 ckantoolkit==0.0.3 +future>=0.18.2 From ec030627b1b2761e3b57b98b8144a0844f7f1972 Mon Sep 17 00:00:00 2001 From: Sergey Motornyuk Date: Thu, 16 Jan 2020 14:31:03 +0200 Subject: [PATCH 04/21] config from ckantoolkit --- ckanext/dcat/plugins/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index ecf5c065..706d4259 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -3,7 +3,7 @@ from builtins import object import os -from pylons import config +from ckantoolkit import config from ckan import plugins as p try: From b86256bfc710f11b9b21668cd52f5533baff5a6f Mon Sep 17 00:00:00 2001 From: Sergey Motornyuk Date: Thu, 16 Jan 2020 15:25:31 +0200 Subject: [PATCH 05/21] fix dict.itmes --- ckanext/dcat/plugins/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/plugins/__init__.py b/ckanext/dcat/plugins/__init__.py index 706d4259..7c32f458 100644 --- a/ckanext/dcat/plugins/__init__.py +++ b/ckanext/dcat/plugins/__init__.py @@ -110,7 +110,7 @@ def after_show(self, context, data_dict): field_labels = utils.field_labels() def set_titles(object_dict): - for key, value in object_dict.items(): + for key, value in list(object_dict.items()): if key in field_labels: object_dict[field_labels[key]] = object_dict[key] del object_dict[key] From 8b35a386d4e465db777d0888dbc9891ba2078c38 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 3 Feb 2020 15:50:28 +0100 Subject: [PATCH 06/21] Rename views to blueprints, fix routes registration --- ckanext/dcat/{views.py => blueprints.py} | 18 +++++++++++++----- ckanext/dcat/plugins/flask_plugin.py | 6 +++--- 2 files changed, 16 insertions(+), 8 deletions(-) rename ckanext/dcat/{views.py => blueprints.py} (72%) diff --git a/ckanext/dcat/views.py b/ckanext/dcat/blueprints.py similarity index 72% rename from ckanext/dcat/views.py rename to ckanext/dcat/blueprints.py index 3f76684b..44ab05d2 100644 --- a/ckanext/dcat/views.py +++ b/ckanext/dcat/blueprints.py @@ -3,20 +3,25 @@ from ckantoolkit import config +from ckan.views.dataset import CreateView + import ckan.plugins.toolkit as toolkit import ckanext.dcat.utils as utils -dcat = Blueprint('dcat', __name__) +dcat = Blueprint( + 'dcat', + __name__, + url_defaults={u'package_type': u'dataset'} +) -def read_catalog(_format=None): +def read_catalog(_format=None, package_type=None): return utils.read_catalog_page(_format) -def read_dataset(_id, _format): +def read_dataset(_id, _format=None, package_type=None): 
return utils.read_dataset_page(_id, _format) - if toolkit.asbool(config.get(utils.ENABLE_RDF_ENDPOINTS_CONFIG, True)): # requirements={'_format': 'xml|rdf|n3|ttl|jsonld'} @@ -27,7 +32,10 @@ def read_dataset(_id, _format): dcat.add_url_rule('/dataset/<_id>.<_format>', view_func=read_dataset) if toolkit.asbool(config.get(utils.ENABLE_CONTENT_NEGOTIATION_CONFIG)): - dcat.add_url_rule('/', view_func=read_catalog, endpoint="home.index") + dcat.add_url_rule('/', view_func=read_catalog) + + dcat.add_url_rule('/dataset/new', view_func=CreateView.as_view(str(u'new'))) + dcat.add_url_rule('/dataset/<_id>', view_func=read_dataset) dcat_json_interface = Blueprint('dcat_json_interface', __name__) diff --git a/ckanext/dcat/plugins/flask_plugin.py b/ckanext/dcat/plugins/flask_plugin.py index 68abe71b..7b1b6130 100644 --- a/ckanext/dcat/plugins/flask_plugin.py +++ b/ckanext/dcat/plugins/flask_plugin.py @@ -3,7 +3,7 @@ import ckan.plugins as p import ckanext.dcat.cli as cli -import ckanext.dcat.views as views +import ckanext.dcat.blueprints as blueprints class MixinDCATPlugin(p.SingletonPlugin): @@ -18,7 +18,7 @@ def get_commands(self): # IBlueprint def get_blueprint(self): - return [views.dcat] + return [blueprints.dcat] class MixinDCATJSONInterface(p.SingletonPlugin): @@ -27,4 +27,4 @@ class MixinDCATJSONInterface(p.SingletonPlugin): # IBlueprint def get_blueprint(self): - return [views.dcat_json_interface] + return [blueprints.dcat_json_interface] From d4d6f6bcef9233ee6a78e5090385dc7e8ea0e4e5 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 3 Feb 2020 23:19:45 +0100 Subject: [PATCH 07/21] New pytest based test suite To run against CKAN 2.9. Old nose tests for CKAN <= 2.8 have been moved to the nose folder --- README.md | 9 +- ckanext/dcat/tests/__init__.py | 12 - ckanext/dcat/tests/nose/__init__.py | 12 + ckanext/dcat/tests/nose/test_base_parser.py | 276 +++ ckanext/dcat/tests/nose/test_base_profile.py | 409 +++++ .../dcat/tests/{ => nose}/test_controllers.py | 2 +- ckanext/dcat/tests/nose/test_converters.py | 51 + .../nose/test_euro_dcatap_profile_parse.py | 1051 ++++++++++++ .../test_euro_dcatap_profile_serialize.py | 1157 +++++++++++++ ckanext/dcat/tests/nose/test_harvester.py | 1505 +++++++++++++++++ .../dcat/tests/nose/test_json_harvester.py | 327 ++++ ckanext/dcat/tests/nose/test_logic.py | 309 ++++ .../nose/test_schemaorg_profile_serialize.py | 576 +++++++ ckanext/dcat/tests/nose/test_utils.py | 129 ++ ckanext/dcat/tests/test_base_parser.py | 45 +- ckanext/dcat/tests/test_base_profile.py | 106 +- ckanext/dcat/tests/test_blueprints.py | 583 +++++++ ckanext/dcat/tests/test_converters.py | 64 +- .../tests/test_euro_dcatap_profile_parse.py | 361 ++-- .../test_euro_dcatap_profile_serialize.py | 89 +- ckanext/dcat/tests/test_harvester.py | 353 ++-- ckanext/dcat/tests/test_json_harvester.py | 63 +- ckanext/dcat/tests/test_logic.py | 237 ++- .../tests/test_schemaorg_profile_serialize.py | 35 +- ckanext/dcat/tests/test_utils.py | 127 +- conftest.py | 6 + setup.cfg | 3 + test-nose.ini | 56 + test.ini | 6 +- 29 files changed, 7176 insertions(+), 783 deletions(-) create mode 100644 ckanext/dcat/tests/nose/__init__.py create mode 100644 ckanext/dcat/tests/nose/test_base_parser.py create mode 100644 ckanext/dcat/tests/nose/test_base_profile.py rename ckanext/dcat/tests/{ => nose}/test_controllers.py (99%) create mode 100644 ckanext/dcat/tests/nose/test_converters.py create mode 100644 ckanext/dcat/tests/nose/test_euro_dcatap_profile_parse.py create mode 100644 
ckanext/dcat/tests/nose/test_euro_dcatap_profile_serialize.py create mode 100644 ckanext/dcat/tests/nose/test_harvester.py create mode 100644 ckanext/dcat/tests/nose/test_json_harvester.py create mode 100644 ckanext/dcat/tests/nose/test_logic.py create mode 100644 ckanext/dcat/tests/nose/test_schemaorg_profile_serialize.py create mode 100644 ckanext/dcat/tests/nose/test_utils.py create mode 100644 ckanext/dcat/tests/test_blueprints.py create mode 100644 conftest.py create mode 100644 test-nose.ini diff --git a/README.md b/README.md index 66a74d2a..94c2df3a 100644 --- a/README.md +++ b/README.md @@ -910,9 +910,14 @@ Example output of structured data in JSON-LD: ## Running the Tests -To run the tests, do: +To run the tests on CKAN >= 2.9, do: - nosetests --nologcapture --ckan --with-pylons=test.ini ckanext + pytest --ckan-ini=test.ini ckanext/dcat/tests + + +To run the tests on CKAN <= 2.8, do: + + nosetests --nologcapture --ckan --with-pylons=test-nose.ini ckanext/dcat/tests/nose ## Releases diff --git a/ckanext/dcat/tests/__init__.py b/ckanext/dcat/tests/__init__.py index 8fbc68a1..e69de29b 100644 --- a/ckanext/dcat/tests/__init__.py +++ b/ckanext/dcat/tests/__init__.py @@ -1,12 +0,0 @@ -from ckan.tests.helpers import FunctionalTestBase - -from ckanext.harvest.model import setup as harvest_setup - - -class DCATFunctionalTestBase(FunctionalTestBase): - - def setup(self): - - super(DCATFunctionalTestBase, self).setup() - - harvest_setup() diff --git a/ckanext/dcat/tests/nose/__init__.py b/ckanext/dcat/tests/nose/__init__.py new file mode 100644 index 00000000..8fbc68a1 --- /dev/null +++ b/ckanext/dcat/tests/nose/__init__.py @@ -0,0 +1,12 @@ +from ckan.tests.helpers import FunctionalTestBase + +from ckanext.harvest.model import setup as harvest_setup + + +class DCATFunctionalTestBase(FunctionalTestBase): + + def setup(self): + + super(DCATFunctionalTestBase, self).setup() + + harvest_setup() diff --git a/ckanext/dcat/tests/nose/test_base_parser.py b/ckanext/dcat/tests/nose/test_base_parser.py new file mode 100644 index 00000000..0bf21487 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_base_parser.py @@ -0,0 +1,276 @@ +from builtins import str +from builtins import object +import nose + +from ckantoolkit import config + +from rdflib import Graph, URIRef, Literal +from rdflib.namespace import Namespace, RDF + +from ckanext.dcat.processors import ( + RDFParser, + RDFParserException, + RDFProfileException, + DEFAULT_RDF_PROFILES, + RDF_PROFILES_CONFIG_OPTION +) + +from ckanext.dcat.profiles import RDFProfile + +DCT = Namespace("http://purl.org/dc/terms/") +DCAT = Namespace("http://www.w3.org/ns/dcat#") + +eq_ = nose.tools.eq_ + + +def _default_graph(): + + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + g.add((dataset1, DCT.title, Literal('Test Dataset 1'))) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + distribution1_2 = URIRef("http://example.org/datasets/1/ds/2") + g.add((distribution1_2, RDF.type, DCAT.Distribution)) + + g.add((dataset1, DCAT.distribution, distribution1_1)) + g.add((dataset1, DCAT.distribution, distribution1_2)) + + dataset2 = URIRef("http://example.org/datasets/2") + g.add((dataset2, RDF.type, DCAT.Dataset)) + g.add((dataset2, DCT.title, Literal('Test Dataset 2'))) + + distribution2_1 = URIRef("http://example.org/datasets/2/ds/1") + g.add((distribution2_1, RDF.type, DCAT.Distribution)) + g.add((dataset2, DCAT.distribution, 
distribution2_1)) + + dataset3 = URIRef("http://example.org/datasets/3") + g.add((dataset3, RDF.type, DCAT.Dataset)) + g.add((dataset3, DCT.title, Literal('Test Dataset 3'))) + + return g + + +class MockRDFProfile1(RDFProfile): + + def parse_dataset(self, dataset_dict, dataset_ref): + + dataset_dict['profile_1'] = True + + return dataset_dict + + +class MockRDFProfile2(RDFProfile): + + def parse_dataset(self, dataset_dict, dataset_ref): + + dataset_dict['profile_2'] = True + + return dataset_dict + + +class TestRDFParser(object): + + def test_default_profile(self): + + p = RDFParser() + + eq_(sorted([pr.name for pr in p._profiles]), + sorted(DEFAULT_RDF_PROFILES)) + + def test_profiles_via_config_option(self): + + original_config = config.copy() + + config[RDF_PROFILES_CONFIG_OPTION] = 'profile_conf_1 profile_conf_2' + try: + RDFParser() + except RDFProfileException as e: + + eq_(str(e), 'Unknown RDF profiles: profile_conf_1, profile_conf_2') + + config.clear() + config.update(original_config) + + def test_no_profile_provided(self): + try: + RDFParser(profiles=[]) + except RDFProfileException as e: + + eq_(str(e), 'No suitable RDF profiles could be loaded') + + def test_profile_not_found(self): + try: + RDFParser(profiles=['not_found']) + except RDFProfileException as e: + + eq_(str(e), 'Unknown RDF profiles: not_found') + + def test_profiles_are_called_on_datasets(self): + + p = RDFParser() + + p._profiles = [MockRDFProfile1, MockRDFProfile2] + + p.g = _default_graph() + + for dataset in p.datasets(): + assert dataset['profile_1'] + assert dataset['profile_2'] + + def test_parse_data(self): + + data = ''' + + + Some label + + + ''' + + p = RDFParser() + + eq_(len(p.g), 0) + + p.parse(data) + + eq_(len(p.g), 2) + + def test_parse_pagination_next_page(self): + + data = ''' + + + 245 + http://example.com/catalog.xml?page=3 + 100 + http://example.com/catalog.xml?page=2 + http://example.com/catalog.xml?page=1 + + + ''' + + p = RDFParser() + + p.parse(data) + + eq_(p.next_page(), 'http://example.com/catalog.xml?page=2') + + def test_parse_without_pagination(self): + + data = ''' + + + Some label + + + ''' + + p = RDFParser() + + p.parse(data) + + eq_(p.next_page(), None) + + def test_parse_pagination_last_page(self): + + data = ''' + + + 245 + http://example.com/catalog.xml?page=3 + 100 + http://example.com/catalog.xml?page=1 + http://example.com/catalog.xml?page=2 + + + ''' + + p = RDFParser() + + p.parse(data) + + eq_(p.next_page(), None) + + def test_parse_data_different_format(self): + + data = ''' + @prefix rdf: . + @prefix rdfs: . + + a rdfs:SomeClass ; + rdfs:label "Some label" . 
+ ''' + + p = RDFParser() + + eq_(len(p.g), 0) + + p.parse(data, _format='n3') + + eq_(len(p.g), 2) + + def test_parse_data_raises_on_parse_error(self): + + p = RDFParser() + + data = 'Wrong data' + + nose.tools.assert_raises(RDFParserException, p.parse, '') + + nose.tools.assert_raises(RDFParserException, p.parse, data) + + nose.tools.assert_raises(RDFParserException, p.parse, data, + _format='n3',) + + def test__datasets(self): + + p = RDFParser() + + p.g = _default_graph() + + eq_(len([d for d in p._datasets()]), 3) + + def test__datasets_none_found(self): + + p = RDFParser() + + p.g = Graph() + + eq_(len([d for d in p._datasets()]), 0) + + def test_datasets(self): + + p = RDFParser() + + p.g = _default_graph() + + datasets = [] + for dataset in p.datasets(): + + assert 'title' in dataset + + datasets.append(dataset) + + eq_(len(datasets), 3) + + def test_datasets_none_found(self): + + p = RDFParser() + + p.g = Graph() + + eq_(len([d for d in p.datasets()]), 0) diff --git a/ckanext/dcat/tests/nose/test_base_profile.py b/ckanext/dcat/tests/nose/test_base_profile.py new file mode 100644 index 00000000..5eae55c9 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_base_profile.py @@ -0,0 +1,409 @@ +from builtins import str +from builtins import object +import nose + +from rdflib import Graph, URIRef, Literal +from rdflib.namespace import Namespace + +from ckantoolkit.tests import helpers + +from ckanext.dcat.profiles import RDFProfile, CleanedURIRef + +from ckanext.dcat.tests.nose.test_base_parser import _default_graph + + +eq_ = nose.tools.eq_ + +DCT = Namespace("http://purl.org/dc/terms/") +TEST = Namespace("http://test.org/") +DCAT = Namespace("http://www.w3.org/ns/dcat#") +ADMS = Namespace("http://www.w3.org/ns/adms#") + + +class TestURIRefPreprocessing(object): + + def test_with_valid_items(self): + testUriPart = "://www.w3.org/ns/dcat#" + + for prefix in ['http', 'https']: + eq_(CleanedURIRef(prefix + testUriPart), URIRef(prefix + testUriPart)) + # leading and trailing whitespace should be removed + eq_(CleanedURIRef(' ' + prefix + testUriPart + ' '), URIRef(prefix + testUriPart)) + + testNonHttpUri = "mailto:someone@example.com" + eq_(CleanedURIRef(testNonHttpUri), URIRef(testNonHttpUri)) + # leading and trailing whitespace should be removed again + eq_(CleanedURIRef(' ' + testNonHttpUri + ' '), URIRef(testNonHttpUri)) + + def test_with_invalid_items(self): + testUriPart = "://www.w3.org/ns/!dcat #" + expectedUriPart = "://www.w3.org/ns/%21dcat%20#" + + for prefix in ['http', 'https']: + eq_(CleanedURIRef(prefix + testUriPart), URIRef(prefix + expectedUriPart)) + # applying on escaped data should have no effect + eq_(CleanedURIRef(prefix + expectedUriPart), URIRef(prefix + expectedUriPart)) + + # leading and trailing space should not be escaped + testNonHttpUri = " mailto:with space!@example.com " + expectedNonHttpUri = "mailto:with%20space%21@example.com" + + eq_(CleanedURIRef(testNonHttpUri), URIRef(expectedNonHttpUri)) + # applying on escaped data should have no effect + eq_(CleanedURIRef(expectedNonHttpUri), URIRef(expectedNonHttpUri)) + + +class TestBaseRDFProfile(object): + + def test_datasets(self): + + p = RDFProfile(_default_graph()) + + eq_(len([d for d in p._datasets()]), 3) + + def test_datasets_none_found(self): + + p = RDFProfile(Graph()) + + eq_(len([d for d in p._datasets()]), 0) + + def test_distributions(self): + + p = RDFProfile(_default_graph()) + + for dataset in p._datasets(): + if str(dataset) == 'http://example.org/datasets/1': + eq_(len([d for d in 
p._distributions(dataset)]), 2) + elif str(dataset) == 'http://example.org/datasets/2': + eq_(len([d for d in p._distributions(dataset)]), 1) + elif str(dataset) == 'http://example.org/datasets/3': + eq_(len([d for d in p._distributions(dataset)]), 0) + + def test_object(self): + + p = RDFProfile(_default_graph()) + + _object = p._object(URIRef('http://example.org/datasets/1'), + DCT.title) + + assert isinstance(_object, Literal) + eq_(str(_object), 'Test Dataset 1') + + def test_object_not_found(self): + + p = RDFProfile(_default_graph()) + + _object = p._object(URIRef('http://example.org/datasets/1'), + DCT.unknown_property) + + eq_(_object, None) + + def test_object_value(self): + + p = RDFProfile(_default_graph()) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title) + + assert isinstance(value, str) + eq_(value, 'Test Dataset 1') + + def test_object_value_not_found(self): + + p = RDFProfile(_default_graph()) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.unknown_property) + + eq_(value, '') + + @helpers.change_config('ckan.locale_default', 'de') + def test_object_value_default_lang(self): + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Datensatz 1', lang='de'))) + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Dataset 1 (EN)', lang='en'))) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title) + + assert isinstance(value, str) + eq_(value, 'Test Datensatz 1') + + @helpers.change_config('ckan.locale_default', 'fr') + def test_object_value_default_lang_not_in_graph(self): + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Datensatz 1', lang='de'))) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title) + + assert isinstance(value, str) + # FR is not in graph, so either node may be used + assert value.startswith('Test D') + assert value.endswith(' 1') + + def test_object_value_default_lang_fallback(self): + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Datensatz 1', lang='de'))) + p.g.add((URIRef('http://example.org/datasets/1'), + DCT.title, Literal('Test Dataset 1 (EN)', lang='en'))) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title) + + assert isinstance(value, str) + # without config parameter, EN is used as default + eq_(value, 'Test Dataset 1 (EN)') + + def test_object_value_default_lang_missing_lang_param(self): + p = RDFProfile(_default_graph()) + + value = p._object_value(URIRef('http://example.org/datasets/1'), + DCT.title) + + assert isinstance(value, str) + eq_(value, 'Test Dataset 1') + + def test_object_int(self): + + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + TEST.some_number, + Literal('23'))) + + value = p._object_value_int(URIRef('http://example.org/datasets/1'), + TEST.some_number) + + assert isinstance(value, int) + eq_(value, 23) + + def test_object_int_decimal(self): + + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + TEST.some_number, + Literal('23.0'))) + + value = p._object_value_int(URIRef('http://example.org/datasets/1'), + TEST.some_number) + + assert isinstance(value, int) + eq_(value, 23) + + def test_object_int_not_found(self): + + p = RDFProfile(_default_graph()) + + value = 
p._object_value_int(URIRef('http://example.org/datasets/1'), + TEST.some_number) + + eq_(value, None) + + def test_object_int_wrong_value(self): + + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + TEST.some_number, + Literal('Not an intger'))) + + value = p._object_value_int(URIRef('http://example.org/datasets/1'), + TEST.some_number) + + eq_(value, None) + + def test_object_list(self): + + p = RDFProfile(_default_graph()) + + p.g.add((URIRef('http://example.org/datasets/1'), + DCAT.keyword, + Literal('space'))) + p.g.add((URIRef('http://example.org/datasets/1'), + DCAT.keyword, + Literal('moon'))) + + value = p._object_value_list(URIRef('http://example.org/datasets/1'), + DCAT.keyword) + + assert isinstance(value, list) + assert isinstance(value[0], str) + eq_(len(value), 2) + eq_(sorted(value), ['moon', 'space']) + + def test_object_list_not_found(self): + + p = RDFProfile(_default_graph()) + + value = p._object_value_list(URIRef('http://example.org/datasets/1'), + TEST.some_list) + + assert isinstance(value, list) + eq_(value, []) + + def test_time_interval_schema_org(self): + + data = ''' + + + + + 1905-03-01 + 2013-01-05 + + + + + ''' + + g = Graph() + + g.parse(data=data) + + p = RDFProfile(g) + + start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) + + eq_(start, '1905-03-01') + eq_(end, '2013-01-05') + + def test_time_interval_w3c_time(self): + + data = ''' + + + + + + + 1904 + + + + + 2014-03-22 + + + + + + + ''' + + g = Graph() + + g.parse(data=data) + + p = RDFProfile(g) + + start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) + + eq_(start, '1904-01-01') + eq_(end, '2014-03-22') + + def test_publisher_foaf(self): + + data = ''' + + + + + Publishing Organization for dataset 1 + contact@some.org + http://some.org + + + + + + ''' + + g = Graph() + + g.parse(data=data) + + p = RDFProfile(g) + + publisher = p._publisher(URIRef('http://example.org'), DCT.publisher) + + eq_(publisher['uri'], 'http://orgs.vocab.org/some-org') + eq_(publisher['name'], 'Publishing Organization for dataset 1') + eq_(publisher['email'], 'contact@some.org') + eq_(publisher['url'], 'http://some.org') + eq_(publisher['type'], 'http://purl.org/adms/publishertype/NonProfitOrganisation') + + def test_publisher_ref(self): + + data = ''' + + + + + + ''' + + g = Graph() + + g.parse(data=data) + + p = RDFProfile(g) + + publisher = p._publisher(URIRef('http://example.org'), DCT.publisher) + + eq_(publisher['uri'], 'http://orgs.vocab.org/some-org') + + def test_contact_details(self): + + data = ''' + + + + + Point of Contact + + + + + + ''' + + g = Graph() + + g.parse(data=data) + + p = RDFProfile(g) + + contact = p._contact_details(URIRef('http://example.org'), ADMS.contactPoint) + + eq_(contact['name'], 'Point of Contact') + # mailto gets removed for storage and is added again on output + eq_(contact['email'], 'contact@some.org') diff --git a/ckanext/dcat/tests/test_controllers.py b/ckanext/dcat/tests/nose/test_controllers.py similarity index 99% rename from ckanext/dcat/tests/test_controllers.py rename to ckanext/dcat/tests/nose/test_controllers.py index 06fa11a1..d76d177f 100644 --- a/ckanext/dcat/tests/test_controllers.py +++ b/ckanext/dcat/tests/nose/test_controllers.py @@ -17,7 +17,7 @@ from ckanext.dcat.profiles import RDF, DCAT from ckanext.dcat.processors import HYDRA -from ckanext.dcat.tests import DCATFunctionalTestBase +from ckanext.dcat.tests.nose import DCATFunctionalTestBase eq_ = nose.tools.eq_ assert_true = 
nose.tools.assert_true diff --git a/ckanext/dcat/tests/nose/test_converters.py b/ckanext/dcat/tests/nose/test_converters.py new file mode 100644 index 00000000..dde782c8 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_converters.py @@ -0,0 +1,51 @@ +from builtins import object +import os +import json +import difflib + +from ckanext.dcat import converters + + +class TestConverters(object): + + def _get_file_as_dict(self, file_name): + path = os.path.join(os.path.dirname(__file__), + '..', '..', '..', 'examples', + file_name) + with open(path, 'r') as f: + return json.load(f) + + def _poor_mans_dict_diff(self, d1, d2): + def _get_lines(d): + return sorted([l.strip().rstrip(',') + for l in json.dumps(d, indent=0).split('\n') + if not l.startswith(('{', '}', '[', ']'))]) + + d1_lines = _get_lines(d1) + d2_lines = _get_lines(d2) + + return '\n' + '\n'.join([l for l in difflib.ndiff(d1_lines, d2_lines) + if l.startswith(('-', '+'))]) + + def test_ckan_to_dcat(self): + ckan_dict = self._get_file_as_dict('full_ckan_dataset.json') + expected_dcat_dict = self._get_file_as_dict('dataset.json') + + dcat_dict = converters.ckan_to_dcat(ckan_dict) + + assert dcat_dict == expected_dcat_dict, self._poor_mans_dict_diff( + expected_dcat_dict, dcat_dict) + + def test_dcat_to_ckan(self): + dcat_dict = self._get_file_as_dict('dataset.json') + expected_ckan_dict = self._get_file_as_dict('ckan_dataset.json') + + # Pop CKAN specific fields + expected_ckan_dict.pop('id', None) + expected_ckan_dict['resources'][0].pop('id', None) + expected_ckan_dict['resources'][0].pop('package_id', None) + + ckan_dict = converters.dcat_to_ckan(dcat_dict) + + assert ckan_dict == expected_ckan_dict, self._poor_mans_dict_diff( + expected_ckan_dict, ckan_dict) diff --git a/ckanext/dcat/tests/nose/test_euro_dcatap_profile_parse.py b/ckanext/dcat/tests/nose/test_euro_dcatap_profile_parse.py new file mode 100644 index 00000000..4bc123e0 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_euro_dcatap_profile_parse.py @@ -0,0 +1,1051 @@ +from builtins import str +from builtins import object +import os +import json + +import nose + +from ckantoolkit import config + +from rdflib import Graph, URIRef, BNode, Literal +from rdflib.namespace import RDF + +from ckan.plugins import toolkit + +from ckantoolkit.tests import helpers, factories + +from ckanext.dcat.processors import RDFParser, RDFSerializer +from ckanext.dcat.profiles import (DCAT, DCT, ADMS, LOCN, SKOS, GSP, RDFS, + GEOJSON_IMT) +from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS, DCAT_CLEAN_TAGS + +eq_ = nose.tools.eq_ +assert_true = nose.tools.assert_true + + +class BaseParseTest(object): + + def _extras(self, dataset): + extras = {} + for extra in dataset.get('extras'): + extras[extra['key']] = extra['value'] + return extras + + def _get_file_contents(self, file_name): + path = os.path.join(os.path.dirname(__file__), + '..', '..', '..', 'examples', + file_name) + with open(path, 'r') as f: + return f.read() + + +class TestEuroDCATAPProfileParsing(BaseParseTest): + + def _build_and_parse_format_mediatype_graph(self, format_item=None, mediatype_item=None): + """ + Creates a minimal graph with a distribution having the specified dct:format and dcat:mediaType + nodes. At least one of those nodes has to be given. + + After creating the graph, it is parsed using the euro_dcat_ap profile. + + :param format_item: + Literal or URIRef object for dct:format. None if the node should be omitted. + :param mediatype_item: + Literal or URIRef object for dcat:mediaType. 
None if the node should be omitted. + + :returns: + The parsed resource dict + """ + g = Graph() + + dataset = URIRef("http://example.org/datasets/1") + g.add((dataset, RDF.type, DCAT.Dataset)) + + distribution = URIRef("http://example.org/datasets/1/ds/1") + g.add((dataset, DCAT.distribution, distribution)) + g.add((distribution, RDF.type, DCAT.Distribution)) + if format_item: + g.add((distribution, DCT['format'], format_item)) + if mediatype_item: + g.add((distribution, DCAT.mediaType, mediatype_item)) + if format_item is None and mediatype_item is None: + raise AssertionError('At least one of format or mediaType is required!') + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + dataset = [d for d in p.datasets()][0] + return dataset.get('resources') + + def test_dataset_all_fields(self): + + contents = self._get_file_contents('dataset.rdf') + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.parse(contents) + + datasets = [d for d in p.datasets()] + + eq_(len(datasets), 1) + + dataset = datasets[0] + + # Basic fields + + eq_(dataset['title'], u'Zimbabwe Regional Geochemical Survey.') + eq_(dataset['notes'], u'During the period 1982-86 a team of geologists from the British Geological Survey ...') + eq_(dataset['url'], 'http://dataset.info.org') + eq_(dataset['version'], '2.3') + eq_(dataset['license_id'], 'cc-nc') + + # Tags + + eq_(sorted(dataset['tags'], key=lambda k: k['name']), [{'name': u'exploration'}, + {'name': u'geochemistry'}, + {'name': u'geology'}]) + # Extras + + def _get_extra_value(key): + v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key] + return v[0] if v else None + + def _get_extra_value_as_list(key): + value = _get_extra_value(key) + return json.loads(value) if value else [] + + # Simple values + eq_(_get_extra_value('issued'), u'2012-05-10') + eq_(_get_extra_value('modified'), u'2012-05-10T21:04:00') + eq_(_get_extra_value('identifier'), u'9df8df51-63db-37a8-e044-0003ba9b0d98') + eq_(_get_extra_value('version_notes'), u'New schema added') + eq_(_get_extra_value('temporal_start'), '1905-03-01') + eq_(_get_extra_value('temporal_end'), '2013-01-05') + eq_(_get_extra_value('frequency'), 'http://purl.org/cld/freq/daily') + eq_(_get_extra_value('spatial_uri'), 'http://publications.europa.eu/mdr/authority/country/ZWE') + eq_(_get_extra_value('publisher_uri'), 'http://orgs.vocab.org/some-org') + eq_(_get_extra_value('publisher_name'), 'Publishing Organization for dataset 1') + eq_(_get_extra_value('publisher_email'), 'contact@some.org') + eq_(_get_extra_value('publisher_url'), 'http://some.org') + eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation') + eq_(_get_extra_value('contact_name'), 'Point of Contact') + # mailto gets removed for storage and is added again on output + eq_(_get_extra_value('contact_email'), 'contact@some.org') + eq_(_get_extra_value('access_rights'), 'public') + eq_(_get_extra_value('provenance'), 'Some statement about provenance') + eq_(_get_extra_value('dcat_type'), 'test-type') + + # Lists + eq_(sorted(_get_extra_value_as_list('language')), [u'ca', u'en', u'es']) + eq_(sorted(_get_extra_value_as_list('theme')), [u'Earth Sciences', + u'http://eurovoc.europa.eu/100142', + u'http://eurovoc.europa.eu/209065']) + eq_(sorted(_get_extra_value_as_list('conforms_to')), [u'Standard 1', u'Standard 2']) + + eq_(sorted(_get_extra_value_as_list('alternate_identifier')), [u'alternate-identifier-1', u'alternate-identifier-2']) + eq_(sorted(_get_extra_value_as_list('documentation')), 
[u'http://dataset.info.org/doc1', + u'http://dataset.info.org/doc2']) + eq_(sorted(_get_extra_value_as_list('related_resource')), [u'http://dataset.info.org/related1', + u'http://dataset.info.org/related2']) + eq_(sorted(_get_extra_value_as_list('has_version')), [u'https://data.some.org/catalog/datasets/derived-dataset-1', + u'https://data.some.org/catalog/datasets/derived-dataset-2']) + eq_(sorted(_get_extra_value_as_list('is_version_of')), [u'https://data.some.org/catalog/datasets/original-dataset']) + eq_(sorted(_get_extra_value_as_list('source')), [u'https://data.some.org/catalog/datasets/source-dataset-1', + u'https://data.some.org/catalog/datasets/source-dataset-2']) + eq_(sorted(_get_extra_value_as_list('sample')), [u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample']) + + # Dataset URI + eq_(_get_extra_value('uri'), u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98') + + # Resources + eq_(len(dataset['resources']), 1) + + resource = dataset['resources'][0] + + # Simple values + eq_(resource['name'], u'Some website') + eq_(resource['description'], u'A longer description') + eq_(resource['format'], u'HTML') + eq_(resource['mimetype'], u'text/html') + eq_(resource['issued'], u'2012-05-11') + eq_(resource['modified'], u'2012-05-01T00:04:06') + eq_(resource['status'], u'http://purl.org/adms/status/Completed') + + eq_(resource['hash'], u'4304cf2e751e6053c90b1804c89c0ebb758f395a') + eq_(resource['hash_algorithm'], u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1') + + # Lists + for item in [ + ('documentation', [u'http://dataset.info.org/distribution1/doc1', u'http://dataset.info.org/distribution1/doc2']), + ('language', [u'ca', u'en', u'es']), + ('conforms_to', [u'Standard 1', u'Standard 2']), + ]: + eq_(sorted(json.loads(resource[item[0]])), item[1]) + + # These two are likely to need clarification + eq_(resource['license'], u'http://creativecommons.org/licenses/by-nc/2.0/') + eq_(resource['rights'], u'Some statement about rights') + + eq_(resource['url'], u'http://www.bgs.ac.uk/gbase/geochemcd/home.html') + assert 'download_url' not in resource + + eq_(resource['size'], 12323) + + # Distribution URI + eq_(resource['uri'], u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1') + + # owl:versionInfo is tested on the test above + def test_dataset_version_adms(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + g.add((dataset1, ADMS.version, Literal('2.3a'))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + dataset = [d for d in p.datasets()][0] + + eq_(dataset['version'], u'2.3a') + + def test_dataset_license_from_distribution_by_uri(self): + # license_id retrieved from the URI of dcat:license object + g = Graph() + + dataset = URIRef("http://example.org/datasets/1") + g.add((dataset, RDF.type, DCAT.Dataset)) + + distribution = URIRef("http://example.org/datasets/1/ds/1") + g.add((dataset, DCAT.distribution, distribution)) + g.add((distribution, RDF.type, DCAT.Distribution)) + g.add((distribution, DCT.license, + URIRef("http://www.opendefinition.org/licenses/cc-by"))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + dataset = [d for d in p.datasets()][0] + eq_(dataset['license_id'], 'cc-by') + + def test_dataset_license_from_distribution_by_title(self): + # license_id retrieved from dct:title of dcat:license object + g = Graph() + + dataset = URIRef("http://example.org/datasets/1") + g.add((dataset, 
RDF.type, DCAT.Dataset)) + + distribution = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution, RDF.type, DCAT.Distribution)) + g.add((dataset, DCAT.distribution, distribution)) + license = BNode() + g.add((distribution, DCT.license, license)) + g.add((license, DCT.title, Literal("Creative Commons Attribution"))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + dataset = [d for d in p.datasets()][0] + eq_(dataset['license_id'], 'cc-by') + + def test_distribution_access_url(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.accessURL, Literal('http://access.url.org'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['url'], u'http://access.url.org') + eq_(resource['access_url'], u'http://access.url.org') + assert 'download_url' not in resource + + def test_distribution_download_url(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.downloadURL, Literal('http://download.url.org'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['url'], u'http://download.url.org') + eq_(resource['download_url'], u'http://download.url.org') + assert 'access_url' not in resource + + def test_distribution_both_access_and_download_url(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.accessURL, Literal('http://access.url.org'))) + g.add((distribution1_1, DCAT.downloadURL, Literal('http://download.url.org'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['url'], u'http://download.url.org') + eq_(resource['download_url'], u'http://download.url.org') + eq_(resource['access_url'], u'http://access.url.org') + + def test_distribution_format_imt_and_format(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.mediaType, Literal('text/csv'))) + g.add((distribution1_1, DCT['format'], Literal('CSV'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['format'], u'CSV') + eq_(resource['mimetype'], u'text/csv') + + def test_distribution_format_format_only(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) 
+ + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCT['format'], Literal('CSV'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['format'], u'CSV') + + def test_distribution_format_imt_only(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.mediaType, Literal('text/csv'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + if toolkit.check_ckan_version(min_version='2.3'): + eq_(resource['format'], u'CSV') + eq_(resource['mimetype'], u'text/csv') + else: + eq_(resource['format'], u'text/csv') + + @helpers.change_config('ckanext.dcat.normalize_ckan_format', False) + def test_distribution_format_imt_only_normalize_false(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.mediaType, Literal('text/csv'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['format'], u'text/csv') + eq_(resource['mimetype'], u'text/csv') + + @helpers.change_config('ckanext.dcat.normalize_ckan_format', False) + def test_distribution_format_format_only_normalize_false(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCT['format'], Literal('CSV'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['format'], u'CSV') + assert 'mimetype' not in resource + + def test_distribution_format_unknown_imt(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.mediaType, Literal('text/unknown-imt'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['format'], u'text/unknown-imt') + eq_(resource['mimetype'], u'text/unknown-imt') + + def test_distribution_format_imt_normalized(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.mediaType, 
Literal('text/unknown-imt'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['format'], u'text/unknown-imt') + eq_(resource['mimetype'], u'text/unknown-imt') + + def test_distribution_format_format_normalized(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCAT.mediaType, Literal('text/csv'))) + g.add((distribution1_1, DCT['format'], Literal('Comma Separated Values'))) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + if toolkit.check_ckan_version(min_version='2.3'): + eq_(resource['format'], u'CSV') + eq_(resource['mimetype'], u'text/csv') + else: + eq_(resource['format'], u'Comma Separated Values') + + def test_distribution_format_IMT_field(self): + g = Graph() + + dataset1 = URIRef("http://example.org/datasets/1") + g.add((dataset1, RDF.type, DCAT.Dataset)) + + distribution1_1 = URIRef("http://example.org/datasets/1/ds/1") + + imt = BNode() + + g.add((imt, RDF.type, DCT.IMT)) + g.add((imt, RDF.value, Literal('text/turtle'))) + g.add((imt, RDFS.label, Literal('Turtle'))) + + g.add((distribution1_1, RDF.type, DCAT.Distribution)) + g.add((distribution1_1, DCT['format'], imt)) + g.add((dataset1, DCAT.distribution, distribution1_1)) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + resource = datasets[0]['resources'][0] + + eq_(resource['format'], u'Turtle') + eq_(resource['mimetype'], u'text/turtle') + + def test_distribution_dct_format_iana_uri(self): + resources = self._build_and_parse_format_mediatype_graph( + format_item=URIRef("https://www.iana.org/assignments/media-types/application/json") + ) + # IANA mediatype URI should be added to mimetype field as well + assert_true(u'json' in resources[0].get('format').lower()) + eq_(u'https://www.iana.org/assignments/media-types/application/json', + resources[0].get('mimetype')) + + def test_distribution_mediatype_iana_uri_without_format(self): + resources = self._build_and_parse_format_mediatype_graph( + mediatype_item=URIRef("https://www.iana.org/assignments/media-types/application/json") + ) + # IANA mediatype URI should be added to mimetype field and to format as well + eq_(u'https://www.iana.org/assignments/media-types/application/json', + resources[0].get('mimetype')) + eq_(u'https://www.iana.org/assignments/media-types/application/json', + resources[0].get('format')) + + def test_distribution_dct_format_other_uri(self): + resources = self._build_and_parse_format_mediatype_graph( + format_item=URIRef("https://example.com/my/format") + ) + eq_(u'https://example.com/my/format', + resources[0].get('format')) + eq_(None, resources[0].get('mimetype')) + + def test_distribution_dct_format_mediatype_text(self): + resources = self._build_and_parse_format_mediatype_graph( + format_item=Literal("application/json") + ) + # IANA mediatype should be added to mimetype field as well + assert_true(u'json' in resources[0].get('format').lower()) + eq_(u'application/json', + resources[0].get('mimetype')) + + def test_distribution_format_and_dcat_mediatype(self): + # Even if 
dct:format is a valid IANA type, prefer dcat:mediaType if given + resources = self._build_and_parse_format_mediatype_graph( + format_item=Literal("application/json"), + mediatype_item=Literal("test-mediatype") + ) + # both should be stored + assert_true(u'json' in resources[0].get('format').lower()) + eq_(u'test-mediatype', + resources[0].get('mimetype')) + + def test_catalog_xml_rdf(self): + + contents = self._get_file_contents('catalog.rdf') + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.parse(contents) + + datasets = [d for d in p.datasets()] + + eq_(len(datasets), 2) + + dataset = (datasets[0] if datasets[0]['title'] == 'Example dataset 1' + else datasets[1]) + + eq_(dataset['title'], 'Example dataset 1') + eq_(len(dataset['resources']), 3) + eq_(len(dataset['tags']), 2) + + def test_dataset_turtle_1(self): + + contents = self._get_file_contents('dataset_deri.ttl') + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.parse(contents, _format='n3') + + datasets = [d for d in p.datasets()] + + eq_(len(datasets), 1) + + dataset = datasets[0] + + eq_(dataset['title'], 'Abandoned Vehicles') + eq_(len(dataset['resources']), 1) + + resource = dataset['resources'][0] + eq_(resource['name'], u'CSV distribution of: Abandoned Vehicles') + eq_(resource['url'], u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv') + eq_(resource['uri'], u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv') + + def test_dataset_json_ld_1(self): + + contents = self._get_file_contents('catalog_pod.jsonld') + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.parse(contents, _format='json-ld') + + datasets = [d for d in p.datasets()] + + eq_(len(datasets), 1) + + dataset = datasets[0] + extras = dict((e['key'], e['value']) for e in dataset['extras']) + + eq_(dataset['title'], 'U.S. 
Widget Manufacturing Statistics') + + eq_(extras['contact_name'], 'Jane Doe') + # mailto gets removed for storage and is added again on output + eq_(extras['contact_email'], 'jane.doe@agency.gov') + eq_(extras['publisher_name'], 'Widget Services') + eq_(extras['publisher_email'], 'widget.services@agency.gov') + + eq_(len(dataset['resources']), 4) + + resource = [r for r in dataset['resources'] if r['name'] == 'widgets.csv'][0] + eq_(resource['name'], u'widgets.csv') + eq_(resource['url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv') + eq_(resource['download_url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv') + + def test_dataset_json_ld_with_at_graph(self): + + contents = self._get_file_contents('catalog_with_at_graph.jsonld') + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.parse(contents, _format='json-ld') + + datasets = [d for d in p.datasets()] + + eq_(len(datasets), 1) + + dataset = datasets[0] + extras = dict((e['key'], e['value']) for e in dataset['extras']) + + eq_(dataset['title'], 'Title dataset') + + eq_(extras['contact_name'], 'Jane Doe') + # mailto gets removed for storage and is added again on output + eq_(extras['contact_email'], 'jane.doe@agency.gov') + + eq_(len(dataset['resources']), 1) + + resource = dataset['resources'][0] + eq_(resource['name'], u'download.zip') + eq_(resource['url'], u'http://example2.org/files/download.zip') + eq_(resource['access_url'], u'https://ckan.example.org/dataset/d4ce4e6e-ab89-44cb-bf5c-33a162c234de/resource/a289c289-55c9-410f-b4c7-f88e5f6f4e47') + eq_(resource['download_url'], u'http://example2.org/files/download.zip') + + def test_dataset_compatibility_mode(self): + + contents = self._get_file_contents('dataset.rdf') + + p = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True) + + p.parse(contents) + + datasets = [d for d in p.datasets()] + + eq_(len(datasets), 1) + + dataset = datasets[0] + + def _get_extra_value(key): + v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key] + return v[0] if v else None + + eq_(_get_extra_value('dcat_issued'), u'2012-05-10') + eq_(_get_extra_value('dcat_modified'), u'2012-05-10T21:04:00') + eq_(_get_extra_value('dcat_publisher_name'), 'Publishing Organization for dataset 1') + eq_(_get_extra_value('dcat_publisher_email'), 'contact@some.org') + eq_(_get_extra_value('language'), 'ca,en,es') + + @helpers.change_config(DCAT_EXPOSE_SUBCATALOGS, 'true') + def test_parse_subcatalog(self): + publisher = {'name': 'Publisher', + 'email': 'email@test.com', + 'type': 'Publisher', + 'uri': 'http://pub.lish.er'} + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'test dataset', + 'extras': [ + {'key': 'source_catalog_title', 'value': 'Subcatalog example'}, + {'key': 'source_catalog_homepage', 'value': 'http://subcatalog.example'}, + {'key': 'source_catalog_description', 'value': 'Subcatalog example description'}, + {'key': 'source_catalog_language', 'value': 'http://publications.europa.eu/resource/authority/language/ITA'}, + {'key': 'source_catalog_modified', 'value': '2000-01-01'}, + {'key': 'source_catalog_publisher', 'value': json.dumps(publisher)} + ] + } + catalog_dict = { + 'title': 'My Catalog', + 'description': 'An Open Data Catalog', + 'homepage': 'http://example.com', + 'language': 'de', + } + + s = RDFSerializer() + s.serialize_catalog(catalog_dict, dataset_dicts=[dataset]) + g = s.g + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + # at least one subcatalog with hasPart + 
subcatalogs = list(p.g.objects(None, DCT.hasPart)) + assert_true(subcatalogs) + + # at least one dataset in subcatalogs + subdatasets = [] + for subcatalog in subcatalogs: + datasets = p.g.objects(subcatalog, DCAT.dataset) + for dataset in datasets: + subdatasets.append((dataset,subcatalog,)) + assert_true(subdatasets) + + datasets = dict([(d['title'], d) for d in p.datasets()]) + + for subdataset, subcatalog in subdatasets: + title = str(list(p.g.objects(subdataset, DCT.title))[0]) + dataset = datasets[title] + has_subcat = False + for ex in dataset['extras']: + exval = ex['value'] + exkey = ex['key'] + if exkey == 'source_catalog_homepage': + has_subcat = True + eq_(exval, str(subcatalog)) + # check if we had subcatalog in extras + assert_true(has_subcat) + + +class TestEuroDCATAPProfileParsingSpatial(BaseParseTest): + + def test_spatial_multiple_dct_spatial_instances(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + spatial_uri = URIRef('http://geonames/Newark') + g.add((dataset, DCT.spatial, spatial_uri)) + + location_ref = BNode() + g.add((location_ref, RDF.type, DCT.Location)) + g.add((dataset, DCT.spatial, location_ref)) + g.add((location_ref, + LOCN.geometry, + Literal('{"type": "Point", "coordinates": [23, 45]}', datatype=GEOJSON_IMT))) + + location_ref = BNode() + g.add((location_ref, RDF.type, DCT.Location)) + g.add((dataset, DCT.spatial, location_ref)) + g.add((location_ref, SKOS.prefLabel, Literal('Newark'))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + eq_(extras['spatial_uri'], 'http://geonames/Newark') + eq_(extras['spatial_text'], 'Newark') + eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}') + + def test_spatial_one_dct_spatial_instance(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + spatial_uri = URIRef('http://geonames/Newark') + g.add((dataset, DCT.spatial, spatial_uri)) + + g.add((spatial_uri, RDF.type, DCT.Location)) + g.add((spatial_uri, + LOCN.geometry, + Literal('{"type": "Point", "coordinates": [23, 45]}', datatype=GEOJSON_IMT))) + g.add((spatial_uri, SKOS.prefLabel, Literal('Newark'))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + eq_(extras['spatial_uri'], 'http://geonames/Newark') + eq_(extras['spatial_text'], 'Newark') + eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}') + + def test_spatial_one_dct_spatial_instance_no_uri(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + location_ref = BNode() + g.add((dataset, DCT.spatial, location_ref)) + + g.add((location_ref, RDF.type, DCT.Location)) + g.add((location_ref, + LOCN.geometry, + Literal('{"type": "Point", "coordinates": [23, 45]}', datatype=GEOJSON_IMT))) + g.add((location_ref, SKOS.prefLabel, Literal('Newark'))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + assert_true('spatial_uri' not in extras) + eq_(extras['spatial_text'], 'Newark') + eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}') + + + def test_spatial_rdfs_label(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + spatial_uri = 
URIRef('http://geonames/Newark') + g.add((dataset, DCT.spatial, spatial_uri)) + + g.add((spatial_uri, RDF.type, DCT.Location)) + g.add((spatial_uri, RDFS.label, Literal('Newark'))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + eq_(extras['spatial_text'], 'Newark') + + def test_spatial_both_geojson_and_wkt(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + spatial_uri = URIRef('http://geonames/Newark') + g.add((dataset, DCT.spatial, spatial_uri)) + + g.add((spatial_uri, RDF.type, DCT.Location)) + g.add((spatial_uri, + LOCN.geometry, + Literal('{"type": "Point", "coordinates": [23, 45]}', datatype=GEOJSON_IMT))) + g.add((spatial_uri, + LOCN.geometry, + Literal('POINT (67 89)', datatype=GSP.wktLiteral))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}') + + def test_spatial_wkt_only(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + spatial_uri = URIRef('http://geonames/Newark') + g.add((dataset, DCT.spatial, spatial_uri)) + + g.add((spatial_uri, RDF.type, DCT.Location)) + g.add((spatial_uri, + LOCN.geometry, + Literal('POINT (67 89)', datatype=GSP.wktLiteral))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + # NOTE: geomet returns floats for coordinates on WKT -> GeoJSON + eq_(extras['spatial'], '{"type": "Point", "coordinates": [67.0, 89.0]}') + + def test_spatial_wrong_geometries(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + spatial_uri = URIRef('http://geonames/Newark') + g.add((dataset, DCT.spatial, spatial_uri)) + + g.add((spatial_uri, RDF.type, DCT.Location)) + g.add((spatial_uri, + LOCN.geometry, + Literal('Not GeoJSON', datatype=GEOJSON_IMT))) + g.add((spatial_uri, + LOCN.geometry, + Literal('Not WKT', datatype=GSP.wktLiteral))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + assert_true('spatial' not in extras) + + def test_spatial_literal_only(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + g.add((dataset, DCT.spatial, Literal('Newark'))) + + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + eq_(extras['spatial_text'], 'Newark') + assert_true('spatial_uri' not in extras) + assert_true('spatial' not in extras) + + def test_spatial_uri_only(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + + spatial_uri = URIRef('http://geonames/Newark') + g.add((dataset, DCT.spatial, spatial_uri)) + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + extras = self._extras(datasets[0]) + + eq_(extras['spatial_uri'], 'http://geonames/Newark') + assert_true('spatial_text' not in extras) + assert_true('spatial' not in extras) + + def test_tags_with_commas(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + 
g.add((dataset, DCAT.keyword, Literal('Tree, forest, shrub'))) + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + eq_(len(datasets[0]['tags']), 3) + + INVALID_TAG = "Som`E-in.valid tag!;" + VALID_TAG = {'name': 'some-invalid-tag'} + + @helpers.change_config(DCAT_CLEAN_TAGS, 'true') + def test_tags_with_commas_clean_tags_on(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + g.add((dataset, DCAT.keyword, Literal(self.INVALID_TAG))) + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + datasets = [d for d in p.datasets()] + + assert_true(self.VALID_TAG in datasets[0]['tags']) + assert_true(self.INVALID_TAG not in datasets[0]['tags']) + + + @helpers.change_config(DCAT_CLEAN_TAGS, 'false') + def test_tags_with_commas_clean_tags_off(self): + g = Graph() + + dataset = URIRef('http://example.org/datasets/1') + g.add((dataset, RDF.type, DCAT.Dataset)) + g.add((dataset, DCAT.keyword, Literal(self.INVALID_TAG))) + p = RDFParser(profiles=['euro_dcat_ap']) + + p.g = g + + # when config flag is set to false, bad tags can happen + + datasets = [d for d in p.datasets()] + assert_true(self.VALID_TAG not in datasets[0]['tags']) + assert_true({'name': self.INVALID_TAG} in datasets[0]['tags']) diff --git a/ckanext/dcat/tests/nose/test_euro_dcatap_profile_serialize.py b/ckanext/dcat/tests/nose/test_euro_dcatap_profile_serialize.py new file mode 100644 index 00000000..c3be98bd --- /dev/null +++ b/ckanext/dcat/tests/nose/test_euro_dcatap_profile_serialize.py @@ -0,0 +1,1157 @@ +from builtins import str +from builtins import object +import json + +import nose + +from ckantoolkit import config + +from dateutil.parser import parse as parse_date +from rdflib import URIRef, BNode, Literal +from rdflib.namespace import RDF + +from geomet import wkt + +from ckantoolkit.tests import helpers, factories + +from ckanext.dcat import utils +from ckanext.dcat.processors import RDFSerializer +from ckanext.dcat.profiles import (DCAT, DCT, ADMS, XSD, VCARD, FOAF, SCHEMA, + SKOS, LOCN, GSP, OWL, SPDX, GEOJSON_IMT) +from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS + +eq_ = nose.tools.eq_ +assert_true = nose.tools.assert_true + + +class BaseSerializeTest(object): + + def _extras(self, dataset): + extras = {} + for extra in dataset.get('extras'): + extras[extra['key']] = extra['value'] + return extras + + def _triples(self, graph, subject, predicate, _object, data_type=None): + + if not (isinstance(_object, URIRef) or isinstance(_object, BNode) or _object is None): + if data_type: + _object = Literal(_object, datatype=data_type) + else: + _object = Literal(_object) + triples = [t for t in graph.triples((subject, predicate, _object))] + return triples + + def _triple(self, graph, subject, predicate, _object, data_type=None): + triples = self._triples(graph, subject, predicate, _object, data_type) + return triples[0] if triples else None + + +class TestEuroDCATAPProfileSerializeDataset(BaseSerializeTest): + def _build_graph_and_check_format_mediatype(self, dataset_dict, expected_format, expected_mediatype): + """ + Creates a graph based on the given dict and checks for dct:format and dct:mediaType in the + first resource element. 
+ + :param dataset_dict: + dataset dict, expected to contain one resource + :param expected_format: + expected list of dct:format items in the resource + :param expected_mediatype: + expected list of dcat:mediaType items in the resource + """ + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset_dict) + + # graph should contain the expected nodes + resource_ref = list(g.objects(dataset_ref, DCAT.distribution))[0] + dct_format = list(g.objects(resource_ref, DCT['format'])) + dcat_mediatype = list(g.objects(resource_ref, DCAT.mediaType)) + eq_(expected_format, dct_format) + eq_(expected_mediatype, dcat_mediatype) + + def _get_base_dataset_with_resource(self): + """ + Creates a minimal test dataset with one resource. The dataset and resource are + both returned and can be extended in test cases. + """ + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + return dataset, resource + + def test_graph_from_dataset(self): + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'notes': 'Lorem ipsum', + 'url': 'http://example.com/ds1', + 'version': '1.0b', + 'metadata_created': '2015-06-26T15:21:09.034694', + 'metadata_modified': '2015-06-26T15:21:09.075774', + 'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}], + 'extras': [ + {'key': 'alternate_identifier', 'value': '[\"xyz\", \"abc\"]'}, + {'key': 'version_notes', 'value': 'This is a beta version'}, + {'key': 'frequency', 'value': 'monthly'}, + {'key': 'language', 'value': '[\"en\", \"http://publications.europa.eu/resource/authority/language/ITA\"]'}, + {'key': 'theme', 'value': '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'}, + {'key': 'conforms_to', 'value': '[\"Standard 1\", \"Standard 2\"]'}, + {'key': 'access_rights', 'value': 'public'}, + {'key': 'documentation', 'value': '[\"http://dataset.info.org/doc1\", \"http://dataset.info.org/doc2\"]'}, + {'key': 'provenance', 'value': 'Some statement about provenance'}, + {'key': 'dcat_type', 'value': 'test-type'}, + {'key': 'related_resource', 'value': '[\"http://dataset.info.org/related1\", \"http://dataset.info.org/related2\"]'}, + {'key': 'has_version', 'value': '[\"https://data.some.org/catalog/datasets/derived-dataset-1\", \"https://data.some.org/catalog/datasets/derived-dataset-2\"]'}, + {'key': 'is_version_of', 'value': '[\"https://data.some.org/catalog/datasets/original-dataset\"]'}, + {'key': 'source', 'value': '[\"https://data.some.org/catalog/datasets/source-dataset-1\", \"https://data.some.org/catalog/datasets/source-dataset-2\"]'}, + {'key': 'sample', 'value': '[\"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample\"]'}, + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + eq_(str(dataset_ref), utils.dataset_uri(dataset)) + + # Basic fields + assert self._triple(g, dataset_ref, RDF.type, DCAT.Dataset) + assert self._triple(g, dataset_ref, DCT.title, dataset['title']) + assert self._triple(g, dataset_ref, DCT.description, dataset['notes']) + + assert self._triple(g, dataset_ref, OWL.versionInfo, dataset['version']) + assert self._triple(g, dataset_ref, ADMS.versionNotes, extras['version_notes']) + assert 
self._triple(g, dataset_ref, DCT.accrualPeriodicity, extras['frequency']) + assert self._triple(g, dataset_ref, DCT.accessRights, extras['access_rights']) + assert self._triple(g, dataset_ref, DCT.provenance, extras['provenance']) + assert self._triple(g, dataset_ref, DCT.type, extras['dcat_type']) + + # Tags + eq_(len([t for t in g.triples((dataset_ref, DCAT.keyword, None))]), 2) + for tag in dataset['tags']: + assert self._triple(g, dataset_ref, DCAT.keyword, tag['name']) + + # Dates + assert self._triple(g, dataset_ref, DCT.issued, dataset['metadata_created'], XSD.dateTime) + assert self._triple(g, dataset_ref, DCT.modified, dataset['metadata_modified'], XSD.dateTime) + + # List + for item in [ + ('language', DCT.language, [Literal, URIRef]), + ('theme', DCAT.theme, URIRef), + ('conforms_to', DCT.conformsTo, Literal), + ('alternate_identifier', ADMS.identifier, Literal), + ('documentation', FOAF.page, URIRef), + ('related_resource', DCT.relation, URIRef), + ('has_version', DCT.hasVersion, URIRef), + ('is_version_of', DCT.isVersionOf, URIRef), + ('source', DCT.source, Literal), + ('sample', ADMS.sample, Literal), + ]: + values = json.loads(extras[item[0]]) + eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values)) + for num, value in enumerate(values): + _type = item[2] + if isinstance(item[2], list): + eq_(len(item[2]), len(values)) + _type = item[2][num] + assert self._triple(g, dataset_ref, item[1], _type(value)) + + def test_identifier_extra(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'identifier', 'value': 'idxxx'}, + {'key': 'guid', 'value': 'guidyyy'}, + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert self._triple(g, dataset_ref, DCT.identifier, extras['identifier']) + + def test_identifier_guid(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'guid', 'value': 'guidyyy'}, + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert self._triple(g, dataset_ref, DCT.identifier, extras['guid']) + + def test_identifier_id(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert self._triple(g, dataset_ref, DCT.identifier, dataset['id']) + + def test_alternate_identifier_numeric(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'alternate_identifier', 'value': '1.0'}, + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert self._triple(g, dataset_ref, DCT.identifier, dataset['id']) + + def test_contact_details_extras(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'maintainer': 'Example Maintainer', + 'maintainer_email': 'dep@example.com', + 'author': 'Example Author', + 'author_email': 'ped@example.com', + 'extras': [ + {'key': 'contact_uri', 'value': 'http://example.com/contact'}, + {'key': 'contact_name', 'value': 'Example Contact'}, + {'key': 'contact_email', 'value': 'contact@example.com'}, + + ] + + + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + # Contact details + + contact_details = 
self._triple(g, dataset_ref, DCAT.contactPoint, None)[2] + assert contact_details + eq_(str(contact_details), extras['contact_uri']) + assert self._triple(g, contact_details, VCARD.fn, extras['contact_name']) + assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + extras['contact_email'])) + + def test_contact_details_maintainer(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'maintainer': 'Example Maintainer', + 'maintainer_email': 'dep@example.com', + 'author': 'Example Author', + 'author_email': 'ped@example.com', + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + contact_details = self._triple(g, dataset_ref, DCAT.contactPoint, None)[2] + assert contact_details + assert_true(isinstance(contact_details, BNode)) + assert self._triple(g, contact_details, VCARD.fn, dataset['maintainer']) + assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + dataset['maintainer_email'])) + + def test_contact_details_author(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'author': 'Example Author', + 'author_email': 'ped@example.com', + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + contact_details = self._triple(g, dataset_ref, DCAT.contactPoint, None)[2] + assert contact_details + assert_true(isinstance(contact_details, BNode)) + assert self._triple(g, contact_details, VCARD.fn, dataset['author']) + assert self._triple(g, contact_details, VCARD.hasEmail, URIRef('mailto:' + dataset['author_email'])) + + def test_contact_details_no_duplicate_mailto(self): + # tests that mailto: isn't added again if it is stored in the dataset + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'author': 'Example Author', + 'author_email': 'mailto:ped@example.com', + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + contact_details = self._triple(g, dataset_ref, DCAT.contactPoint, None)[2] + assert contact_details + assert_true(isinstance(contact_details, BNode)) + assert self._triple(g, contact_details, VCARD.fn, dataset['author']) + assert self._triple(g, contact_details, VCARD.hasEmail, URIRef(dataset['author_email'])) + + def test_publisher_extras(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'organization': { + 'id': '', + 'name': 'publisher1', + 'title': 'Example Publisher from Org', + }, + 'extras': [ + {'key': 'publisher_uri', 'value': 'http://example.com/publisher'}, + {'key': 'publisher_name', 'value': 'Example Publisher'}, + {'key': 'publisher_email', 'value': 'publisher@example.com'}, + {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'}, + {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'}, + ] + + + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + publisher = self._triple(g, dataset_ref, DCT.publisher, None)[2] + assert publisher + eq_(str(publisher), extras['publisher_uri']) + + assert self._triple(g, publisher, RDF.type, FOAF.Organization) + assert self._triple(g, publisher, FOAF.name, extras['publisher_name']) + assert self._triple(g, publisher, FOAF.mbox, extras['publisher_email']) + assert self._triple(g, publisher, FOAF.homepage, URIRef(extras['publisher_url'])) + assert self._triple(g, publisher, DCT.type, URIRef(extras['publisher_type'])) + + 
def test_publisher_org(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'organization': { + 'id': '', + 'name': 'publisher1', + 'title': 'Example Publisher from Org', + } + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + publisher = self._triple(g, dataset_ref, DCT.publisher, None)[2] + assert publisher + + assert self._triple(g, publisher, RDF.type, FOAF.Organization) + assert self._triple(g, publisher, FOAF.name, dataset['organization']['title']) + + def test_publisher_no_uri(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'publisher_name', 'value': 'Example Publisher'}, + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + publisher = self._triple(g, dataset_ref, DCT.publisher, None)[2] + assert publisher + assert_true(isinstance(publisher, BNode)) + + assert self._triple(g, publisher, RDF.type, FOAF.Organization) + assert self._triple(g, publisher, FOAF.name, extras['publisher_name']) + + def test_temporal(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'temporal_start', 'value': '2015-06-26T15:21:09.075774'}, + {'key': 'temporal_end', 'value': '2015-07-14'}, + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + temporal = self._triple(g, dataset_ref, DCT.temporal, None)[2] + assert temporal + + assert self._triple(g, temporal, RDF.type, DCT.PeriodOfTime) + assert self._triple(g, temporal, SCHEMA.startDate, parse_date(extras['temporal_start']).isoformat(), XSD.dateTime) + assert self._triple(g, temporal, SCHEMA.endDate, parse_date(extras['temporal_end']).isoformat(), XSD.dateTime) + + def test_spatial(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'spatial_uri', 'value': 'http://sws.geonames.org/6361390/'}, + {'key': 'spatial_text', 'value': 'Tarragona'}, + {'key': 'spatial', 'value': '{"type": "Polygon", "coordinates": [[[1.1870606,41.0786393],[1.1870606,41.1655218],[1.3752339,41.1655218],[1.3752339,41.0786393],[1.1870606,41.0786393]]]}'}, + + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] + assert spatial + eq_(str(spatial), extras['spatial_uri']) + assert self._triple(g, spatial, RDF.type, DCT.Location) + assert self._triple(g, spatial, SKOS.prefLabel, extras['spatial_text']) + + eq_(len([t for t in g.triples((spatial, LOCN.geometry, None))]), 2) + # Geometry in GeoJSON + assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) + + # Geometry in WKT + wkt_geom = wkt.dumps(json.loads(extras['spatial']), decimals=4) + assert self._triple(g, spatial, LOCN.geometry, wkt_geom, GSP.wktLiteral) + + def test_spatial_bad_geojson_no_wkt(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'spatial', 'value': '{"key": "NotGeoJSON"}'}, + + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] + assert spatial + assert_true(isinstance(spatial, BNode)) + # Geometry in GeoJSON + assert self._triple(g, 
spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) + + # Geometry in WKT + eq_(len([t for t in g.triples((spatial, LOCN.geometry, None))]), 1) + + def test_spatial_bad_json_no_wkt(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'spatial', 'value': 'NotJSON'}, + + ] + } + extras = self._extras(dataset) + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + spatial = self._triple(g, dataset_ref, DCT.spatial, None)[2] + assert spatial + assert_true(isinstance(spatial, BNode)) + # Geometry in GeoJSON + assert self._triple(g, spatial, LOCN.geometry, extras['spatial'], GEOJSON_IMT) + + # Geometry in WKT + eq_(len([t for t in g.triples((spatial, LOCN.geometry, None))]), 1) + + def test_distributions(self): + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file' + }, + { + 'id': '8bceeda9-0084-477f-aa33-dad6148900d5', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'XLS file' + }, + { + 'id': 'da73d939-0f11-45a1-9733-5de108383133', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'PDF file' + }, + + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 3) + + for resource in dataset['resources']: + distribution = self._triple(g, + dataset_ref, + DCAT.distribution, + URIRef(utils.resource_uri(resource)))[2] + + assert self._triple(g, distribution, RDF.type, DCAT.Distribution) + assert self._triple(g, distribution, DCT.title, resource['name']) + + def test_distribution_fields(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'description': 'A CSV file', + 'url': 'http://example.com/data/file.csv', + 'status': 'http://purl.org/adms/status/Completed', + 'rights': 'Some statement about rights', + 'license': 'http://creativecommons.org/licenses/by/3.0/', + 'issued': '2015-06-26T15:21:09.034694', + 'modified': '2015-06-26T15:21:09.075774', + 'size': 1234, + 'documentation': '[\"http://dataset.info.org/distribution1/doc1\", \"http://dataset.info.org/distribution1/doc2\"]', + 'language': '[\"en\", \"es\", \"http://publications.europa.eu/resource/authority/language/ITA\"]', + 'conforms_to': '[\"Standard 1\", \"Standard 2\"]', + 'hash': '4304cf2e751e6053c90b1804c89c0ebb758f395a', + 'hash_algorithm': 'http://spdx.org/rdf/terms#checksumAlgorithm_sha1', + + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + eq_(len([t for t in g.triples((dataset_ref, DCAT.distribution, None))]), 1) + + # URI + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + eq_(str(distribution), utils.resource_uri(resource)) + + # Basic fields + assert self._triple(g, distribution, RDF.type, DCAT.Distribution) + assert self._triple(g, distribution, DCT.title, resource['name']) + assert self._triple(g, distribution, DCT.description, resource['description']) + assert self._triple(g, distribution, DCT.rights, resource['rights']) + assert self._triple(g, distribution, 
DCT.license, URIRef(resource['license'])) + assert self._triple(g, distribution, ADMS.status, URIRef(resource['status'])) + + # List + for item in [ + ('documentation', FOAF.page, URIRef), + ('language', DCT.language, [Literal, Literal, URIRef]), + ('conforms_to', DCT.conformsTo, Literal), + ]: + values = json.loads(resource[item[0]]) + eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values)) + for num, value in enumerate(values): + _type = item[2] + if isinstance(item[2], list): + eq_(len(item[2]), len(values)) + _type = item[2][num] + assert self._triple(g, distribution, item[1], _type(value)) + + # Dates + assert self._triple(g, distribution, DCT.issued, resource['issued'], XSD.dateTime) + assert self._triple(g, distribution, DCT.modified, resource['modified'], XSD.dateTime) + + # Numbers + assert self._triple(g, distribution, DCAT.byteSize, float(resource['size']), XSD.decimal) + + # Checksum + checksum = self._triple(g, distribution, SPDX.checksum, None)[2] + assert checksum + assert self._triple(g, checksum, RDF.type, SPDX.Checksum) + assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary') + assert self._triple(g, checksum, SPDX.algorithm, URIRef(resource['hash_algorithm'])) + + def test_distribution_size_not_number(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'size': 'aaaa', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.byteSize, resource['size']) + + def test_distribution_url_only(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.accessURL, URIRef(resource['url'])) + assert self._triple(g, distribution, DCAT.downloadURL, None) is None + + def test_distribution_access_url_only(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'access_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.accessURL, URIRef(resource['access_url'])) + assert self._triple(g, distribution, DCAT.downloadURL, None) is None + + def test_distribution_download_url_only(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'download_url': 
'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.downloadURL, URIRef(resource['download_url'])) + assert self._triple(g, distribution, DCAT.accessURL, None) is None + + def test_distribution_both_urls_different(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.accessURL, URIRef( resource['url'])) + assert self._triple(g, distribution, DCAT.downloadURL, URIRef(resource['download_url'])) + + def test_distribution_both_urls_different_with_access_url(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'access_url': 'http://example.com/data/file', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.accessURL, URIRef( resource['access_url'])) + assert self._triple(g, distribution, DCAT.downloadURL, URIRef(resource['download_url'])) + + def test_distribution_prefer_access_url(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data', + 'access_url': 'http://example.com/data/file', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.accessURL, URIRef( resource['access_url'])) + assert self._triple(g, distribution, DCAT.downloadURL, None) is None + + def test_distribution_prefer_access_url_with_download(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data', + 'access_url': 'http://example.com/data/file', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert 
self._triple(g, distribution, DCAT.accessURL, URIRef( resource['access_url'])) + assert self._triple(g, distribution, DCAT.downloadURL, URIRef(resource['download_url'])) + + def test_distribution_both_urls_the_same(self): + + # old behavior - only serialize url to accessURL if it is different from downloadURL + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file.csv', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.downloadURL, URIRef(resource['url'])) + assert self._triple(g, distribution, DCAT.accessURL, None) is None + + def test_distribution_both_urls_the_same_with_access_url(self): + + # when the access_url is present, it should be serialized regardless if it is the same as downloadURL. + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'access_url': 'http://example.com/data/file.csv', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + assert self._triple(g, distribution, DCAT.downloadURL, URIRef(resource['download_url'])) + assert self._triple(g, distribution, DCAT.accessURL, URIRef(resource['access_url'])) + + def test_distribution_format_iana_uri(self): + dataset_dict, resource = self._get_base_dataset_with_resource() + # when only format is available and it looks like an IANA media type, use DCAT.mediaType instead + # of DCT.format for output + fmt_uri = 'https://www.iana.org/assignments/media-types/application/json' + resource['format'] = fmt_uri + + # expect no dct:format node and the URI in dcat:mediaType + self._build_graph_and_check_format_mediatype( + dataset_dict, + [], + [URIRef(fmt_uri)] + ) + + def test_distribution_format_other_uri(self): + dataset_dict, resource = self._get_base_dataset_with_resource() + # when only format is available and it does not look like an IANA media type, use dct:format + fmt_uri = 'https://example.com/my/format' + resource['format'] = fmt_uri + + # expect dct:format node with the URI and no dcat:mediaType + self._build_graph_and_check_format_mediatype( + dataset_dict, + [URIRef(fmt_uri)], + [] + ) + + def test_distribution_format_mediatype_text(self): + dataset_dict, resource = self._get_base_dataset_with_resource() + # if format value looks like an IANA media type, output dcat:mediaType instead of dct:format + fmt_text = 'application/json' + resource['format'] = fmt_text + + # expect no dct:format node and the literal value in dcat:mediaType + self._build_graph_and_check_format_mediatype( + dataset_dict, + [], + [Literal(fmt_text)] + ) + + def test_distribution_format_mediatype_same(self): + dataset_dict, resource = self._get_base_dataset_with_resource() + # if format and mediaType are identical, output only dcat:mediaType + fmt_text = 
'application/json' + resource['format'] = fmt_text + resource['mimetype'] = fmt_text + + # expect no dct:format node and the literal value in dcat:mediaType + self._build_graph_and_check_format_mediatype( + dataset_dict, + [], + [Literal(fmt_text)] + ) + + def test_distribution_format_mediatype_different(self): + dataset_dict, resource = self._get_base_dataset_with_resource() + # if format and mediaType are different, output both + resource['format'] = 'myformat' + resource['mimetype'] = 'application/json' + + # expect both nodes + self._build_graph_and_check_format_mediatype( + dataset_dict, + [Literal('myformat')], + [Literal('application/json')] + ) + + def test_hash_algorithm_not_uri(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'hash': 'aaaa', + 'hash_algorithm': 'sha1', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, DCAT.distribution, None)[2] + + checksum = self._triple(g, distribution, SPDX.checksum, None)[2] + assert checksum + assert self._triple(g, checksum, RDF.type, SPDX.Checksum) + assert self._triple(g, checksum, SPDX.checksumValue, resource['hash'], data_type='http://www.w3.org/2001/XMLSchema#hexBinary') + assert self._triple(g, checksum, SPDX.algorithm, resource['hash_algorithm']) + + +class TestEuroDCATAPProfileSerializeCatalog(BaseSerializeTest): + + @classmethod + def teardown_class(cls): + helpers.reset_db() + + def test_graph_from_catalog(self): + + s = RDFSerializer() + g = s.g + + catalog = s.graph_from_catalog() + + eq_(str(catalog), utils.catalog_uri()) + + # Basic fields + assert self._triple(g, catalog, RDF.type, DCAT.Catalog) + assert self._triple(g, catalog, DCT.title, config.get('ckan.site_title')) + assert self._triple(g, catalog, FOAF.homepage, URIRef(config.get('ckan.site_url'))) + assert self._triple(g, catalog, DCT.language, 'en') + + def test_graph_from_catalog_dict(self): + + catalog_dict = { + 'title': 'My Catalog', + 'description': 'An Open Data Catalog', + 'homepage': 'http://example.com', + 'language': 'de', + } + + s = RDFSerializer() + g = s.g + + catalog = s.graph_from_catalog(catalog_dict) + + eq_(str(catalog), utils.catalog_uri()) + + # Basic fields + assert self._triple(g, catalog, RDF.type, DCAT.Catalog) + assert self._triple(g, catalog, DCT.title, catalog_dict['title']) + assert self._triple(g, catalog, DCT.description, catalog_dict['description']) + assert self._triple(g, catalog, FOAF.homepage, URIRef(catalog_dict['homepage'])) + assert self._triple(g, catalog, DCT.language, catalog_dict['language']) + + def test_graph_from_catalog_dict_language_uri_ref(self): + + catalog_dict = { + 'title': 'My Catalog', + 'description': 'An Open Data Catalog', + 'homepage': 'http://example.com', + 'language': 'http://publications.europa.eu/resource/authority/language/ITA', + } + + s = RDFSerializer() + g = s.g + + catalog = s.graph_from_catalog(catalog_dict) + + eq_(str(catalog), utils.catalog_uri()) + + # language field + assert self._triple(g, catalog, DCT.language, URIRef(catalog_dict['language'])) + + def test_graph_from_catalog_modified_date(self): + + dataset = factories.Dataset() + + s = RDFSerializer() + g = s.g + + catalog = s.graph_from_catalog() + + eq_(str(catalog), utils.catalog_uri()) + + assert 
self._triple(g, catalog, DCT.modified, dataset['metadata_modified'], XSD.dateTime) + + @helpers.change_config(DCAT_EXPOSE_SUBCATALOGS, 'true') + def test_subcatalog(self): + publisher = {'name': 'Publisher', + 'email': 'email@test.com', + 'type': 'Publisher', + 'uri': 'http://pub.lish.er'} + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'test dataset', + 'extras': [ + {'key': 'source_catalog_title', 'value': 'Subcatalog example'}, + {'key': 'source_catalog_homepage', 'value': 'http://subcatalog.example'}, + {'key': 'source_catalog_description', 'value': 'Subcatalog example description'}, + {'key': 'source_catalog_language', 'value': 'http://publications.europa.eu/resource/authority/language/ITA'}, + {'key': 'source_catalog_modified', 'value': '2000-01-01'}, + {'key': 'source_catalog_publisher', 'value': json.dumps(publisher)} + ] + } + catalog_dict = { + 'title': 'My Catalog', + 'description': 'An Open Data Catalog', + 'homepage': 'http://example.com', + 'language': 'de', + } + + s = RDFSerializer() + g = s.g + + s.serialize_catalog(catalog_dict, dataset_dicts=[dataset]) + + # check if we have catalog->hasPart->subcatalog + catalogs = list(g.triples((None, RDF.type, DCAT.Catalog,))) + root = list(g.subjects(DCT.hasPart, None)) + assert_true(len(catalogs)>0, catalogs) + assert_true(len(root) == 1, root) + + root_ref = root[0] + + # check subcatalog + subcatalogs = list(g.objects(root_ref, DCT.hasPart)) + assert_true(len(subcatalogs) == 1) + stitle = list(g.objects(subcatalogs[0], DCT.title)) + assert_true(len(stitle) == 1) + assert_true(str(stitle[0]) == 'Subcatalog example') + + # check dataset + dataset_ref = list(g.subjects(RDF.type, DCAT.Dataset)) + assert_true(len(dataset_ref) == 1) + dataset_ref = dataset_ref[0] + dataset_title = list(g.objects(dataset_ref, DCT.title)) + assert_true(len(dataset_title) == 1) + assert_true(str(dataset_title[0]) == dataset['title']) diff --git a/ckanext/dcat/tests/nose/test_harvester.py b/ckanext/dcat/tests/nose/test_harvester.py new file mode 100644 index 00000000..9b0b15ec --- /dev/null +++ b/ckanext/dcat/tests/nose/test_harvester.py @@ -0,0 +1,1505 @@ +# -*- coding: utf-8 -*- + +from builtins import str +from builtins import range +from builtins import object +from collections import defaultdict + +import nose +import httpretty +from mock import patch + +from six.moves import xrange + +import ckan.plugins as p +import ckantoolkit.tests.helpers as h + +import ckanext.harvest.model as harvest_model +from ckanext.harvest import queue + +from ckanext.dcat.harvesters import DCATRDFHarvester, DCATJSONHarvester +from ckanext.dcat.interfaces import IDCATRDFHarvester +import ckanext.dcat.harvesters.rdf + + +eq_ = nose.tools.eq_ + + +# This horrible monkey patch is needed because httpretty does not play well +# with redis, so we need to disable it straight after the mocked call is used. 
+# See https://github.com/gabrielfalcao/HTTPretty/issues/113 + +# Start monkey-patch + +original_rdf_get_content_and_type = DCATRDFHarvester._get_content_and_type + +def _patched_rdf_get_content_and_type(self, url, harvest_job, page=1, content_type=None): + + httpretty.enable() + + value1, value2 = original_rdf_get_content_and_type(self, url, harvest_job, page, content_type) + + httpretty.disable() + + return value1, value2 + +DCATRDFHarvester._get_content_and_type = _patched_rdf_get_content_and_type + +original_json_get_content_and_type = DCATJSONHarvester._get_content_and_type + +def _patched_json_get_content_and_type(self, url, harvest_job, page=1, content_type=None): + + httpretty.enable() + + value1, value2 = original_json_get_content_and_type(self, url, harvest_job, page, content_type) + + httpretty.disable() + + return value1, value2 + +DCATJSONHarvester._get_content_and_type = _patched_json_get_content_and_type + +# End monkey-patch + + +class TestRDFHarvester(p.SingletonPlugin): + + p.implements(IDCATRDFHarvester) + + calls = defaultdict(int) + + def before_download(self, url, harvest_job): + + self.calls['before_download'] += 1 + + if url == 'http://return.none': + return None, [] + elif url == 'http://return.errors': + return None, ['Error 1', 'Error 2'] + else: + return url, [] + + def update_session(self, session): + self.calls['update_session'] += 1 + session.headers.update({'x-test': 'true'}) + return session + + def after_download(self, content, harvest_job): + + self.calls['after_download'] += 1 + + if content == 'return.empty.content': + return None, [] + elif content == 'return.errors': + return None, ['Error 1', 'Error 2'] + else: + return content, [] + + def before_update(self, harvest_object, dataset_dict, temp_dict): + self.calls['before_update'] += 1 + + def after_update(self, harvest_object, dataset_dict, temp_dict): + self.calls['after_update'] += 1 + return None + + def before_create(self, harvest_object, dataset_dict, temp_dict): + self.calls['before_create'] += 1 + + def after_create(self, harvest_object, dataset_dict, temp_dict): + self.calls['after_create'] += 1 + return None + + +class TestRDFNullHarvester(TestRDFHarvester): + p.implements(IDCATRDFHarvester) + def before_update(self, harvest_object, dataset_dict, temp_dict): + super(TestRDFNullHarvester, self).before_update(harvest_object, dataset_dict, temp_dict) + dataset_dict.clear() + + def before_create(self, harvest_object, dataset_dict, temp_dict): + super(TestRDFNullHarvester, self).before_create(harvest_object, dataset_dict, temp_dict) + dataset_dict.clear() + + +class TestRDFExceptionHarvester(TestRDFHarvester): + p.implements(IDCATRDFHarvester) + + raised_exception = False + + def before_create(self, harvest_object, dataset_dict, temp_dict): + super(TestRDFExceptionHarvester, self).before_create(harvest_object, dataset_dict, temp_dict) + if not self.raised_exception: + self.raised_exception = True + raise Exception("raising exception in before_create") + + +class TestDCATHarvestUnit(object): + + def test_get_guid_uri_root(self): + + dataset = { + 'name': 'test-dataset', + 'uri': 'http://dataset/uri', + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'http://dataset/uri') + + def test_get_guid_identifier_root(self): + + dataset = { + 'name': 'test-dataset', + 'identifier': 'http://dataset/uri', + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'http://dataset/uri') + + def test_get_guid_uri(self): + + dataset = { + 'name': 'test-dataset', + 'extras': [ + {'key': 
'uri', 'value': 'http://dataset/uri'}, + {'key': 'dcat_identifier', 'value': 'dataset_dcat_id'}, + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'http://dataset/uri') + + def test_get_guid_identifier(self): + + dataset = { + 'name': 'test-dataset', + 'extras': [ + {'key': 'identifier', 'value': 'dataset_dcat_id'}, + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'dataset_dcat_id') + + def test_get_guid_dcat_identifier(self): + + dataset = { + 'name': 'test-dataset', + 'extras': [ + {'key': 'dcat_identifier', 'value': 'dataset_dcat_id'}, + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'dataset_dcat_id') + + def test_get_guid_uri_none(self): + + dataset = { + 'name': 'test-dataset', + 'extras': [ + {'key': 'uri', 'value': None}, + {'key': 'dcat_identifier', 'value': 'dataset_dcat_id'}, + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'dataset_dcat_id') + + def test_get_guid_dcat_identifier_none(self): + + dataset = { + 'name': 'test-dataset', + 'extras': [ + {'key': 'dcat_identifier', 'value': None}, + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'test-dataset') + + def test_get_guid_source_url_name(self): + + dataset = { + 'name': 'test-dataset', + 'extras': [ + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset, 'http://source_url') + + eq_(guid, 'http://source_url/test-dataset') + + guid = DCATRDFHarvester()._get_guid(dataset, 'http://source_url/') + + eq_(guid, 'http://source_url/test-dataset') + + def test_get_guid_name(self): + + dataset = { + 'name': 'test-dataset', + 'extras': [ + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, 'test-dataset') + + def test_get_guid_none(self): + + dataset = { + 'extras': [ + ] + } + + guid = DCATRDFHarvester()._get_guid(dataset) + + eq_(guid, None) + + +class FunctionalHarvestTest(object): + + @classmethod + def setup_class(cls): + + h.reset_db() + + cls.gather_consumer = queue.get_gather_consumer() + cls.fetch_consumer = queue.get_fetch_consumer() + + # Minimal remote RDF file + cls.rdf_mock_url = 'http://some.dcat.file.rdf' + cls.rdf_content_type = 'application/rdf+xml' + cls.rdf_content = ''' + + + + + Example dataset 1 + + + + + Example dataset 2 + + + + + ''' + + # Minimal remote RDF file with pagination (1) + # Use slashes for paginated URLs because HTTPretty won't distinguish + # query strings + cls.rdf_mock_url_pagination_1 = 'http://some.dcat.file.pagination.rdf' + cls.rdf_content_pagination_1 = ''' + + + + + Example dataset 1 + + + + + Example dataset 2 + + + + + 4 + http://some.dcat.file.pagination.rdf/page/2 + 2 + http://some.dcat.file.pagination.rdf/page/2 + http://some.dcat.file.pagination.rdf/page/1 + + + ''' + + # Minimal remote RDF file with pagination (2) + cls.rdf_mock_url_pagination_2 = 'http://some.dcat.file.pagination.rdf/page/2' + cls.rdf_content_pagination_2 = ''' + + + + + Example dataset 3 + + + + + Example dataset 4 + + + + + 4 + http://some.dcat.file.pagination.rdf/page/2 + 2 + http://some.dcat.file.pagination.rdf/page/1 + http://some.dcat.file.pagination.rdf/page/1 + + + ''' + + # Minimal remote RDF file + cls.rdf_mock_url = 'http://some.dcat.file.rdf' + cls.rdf_content_type = 'application/rdf+xml' + cls.rdf_content = ''' + + + + + Example dataset 1 + + + + + Example dataset 2 + + + + + ''' + + cls.rdf_mock_url_duplicates = 'http://some.dcat.file.duplicates.rdf' + cls.rdf_duplicate_titles = ''' + + + + + Example dataset + + + + + Example dataset + + + + + ''' + + 
cls.rdf_remote_file_small = ''' + + + + + Example dataset 1 + + + + + ''' + + # RDF with minimal distribution + cls.rdf_content_with_distribution_uri = ''' + + + + + Example dataset 1 + + + Example resource 1 + http://data.some.org/download.zip + + + + + + + ''' + cls.rdf_content_with_distribution = ''' + + + + + Example dataset 1 + + + Example resource 1 + http://data.some.org/download.zip + + + + + + + ''' + + cls.rdf_remote_file_invalid = ''' + + + ''' + + #Minimal remote turtle file + cls.ttl_mock_url = 'http://some.dcat.file.ttl' + cls.ttl_content_type = 'text/turtle' + cls.ttl_content = '''@prefix dcat: . + @prefix dc: . + + a dcat:Catalog ; + dcat:dataset , . + + a dcat:Dataset ; + dc:title "Example dataset 1" . + + a dcat:Dataset ; + dc:title "Example dataset 2" . + ''' + cls.ttl_remote_file_small = '''@prefix dcat: . + @prefix dc: . + + a dcat:Catalog ; + dcat:dataset , . + + a dcat:Dataset ; + dc:title "Example dataset 1" . + ''' + cls.ttl_unicode_in_keywords = u'''@prefix dcat: . + @prefix dc: . + + a dcat:Catalog ; + dcat:dataset . + + a dcat:Dataset ; + dc:title "Example dataset 1" ; + dcat:keyword "förskola", "Garduña" . + + a dcat:Dataset ; + dc:title "Example dataset 2" ; + dcat:keyword "San Sebastián", "Ελλάδα" . + ''' + cls.ttl_commas_in_keywords = u'''@prefix dcat: . + @prefix dc: . + + a dcat:Catalog ; + dcat:dataset . + + a dcat:Dataset ; + dc:title "Example dataset 1" ; + dcat:keyword "Utbildning, kontaktuppgifter" . + + a dcat:Dataset ; + dc:title "Example dataset 2" ; + dcat:keyword "Trees, forest, shrub" . + ''' + cls.ttl_remote_file_invalid = '''@prefix dcat: . + @prefix dc: . + + a dcat:Catalog ; + + a dcat:Dataset ; + dc:title "Example dataset 1" . + ''' + + def setup(self): + + harvest_model.setup() + + queue.purge_queues() + + def teardown(cls): + h.reset_db() + + def _create_harvest_source(self, mock_url, **kwargs): + + source_dict = { + 'title': 'Test RDF DCAT Source', + 'name': 'test-rdf-dcat-source', + 'url': mock_url, + 'source_type': 'dcat_rdf', + } + + source_dict.update(**kwargs) + + harvest_source = h.call_action('harvest_source_create', + {}, **source_dict) + + return harvest_source + + def _create_harvest_job(self, harvest_source_id): + + harvest_job = h.call_action('harvest_job_create', + {}, source_id=harvest_source_id) + + return harvest_job + + def _run_jobs(self, harvest_source_id=None): + try: + h.call_action('harvest_jobs_run', + {}, source_id=harvest_source_id) + except Exception as e: + if (str(e) == 'There are no new harvesting jobs'): + pass + + def _gather_queue(self, num_jobs=1): + + for job in range(num_jobs): + # Pop one item off the queue (the job id) and run the callback + reply = self.gather_consumer.basic_get( + queue='ckan.harvest.gather.test') + + # Make sure something was sent to the gather queue + assert reply[2], 'Empty gather queue' + + # Send the item to the gather callback, which will call the + # harvester gather_stage + queue.gather_callback(self.gather_consumer, *reply) + + def _fetch_queue(self, num_objects=1): + + for _object in range(num_objects): + # Pop item from the fetch queues (object ids) and run the callback, + # one for each object created + reply = self.fetch_consumer.basic_get( + queue='ckan.harvest.fetch.test') + + # Make sure something was sent to the fetch queue + assert reply[2], 'Empty fetch queue, the gather stage failed' + + # Send the item to the fetch callback, which will call the + # harvester fetch_stage and import_stage + queue.fetch_callback(self.fetch_consumer, *reply) + + def 
_run_full_job(self, harvest_source_id, num_jobs=1, num_objects=1): + + # Create new job for the source + self._create_harvest_job(harvest_source_id) + + # Run the job + self._run_jobs(harvest_source_id) + + # Handle the gather queue + self._gather_queue(num_jobs) + + # Handle the fetch queue + self._fetch_queue(num_objects) + + +class TestDCATHarvestFunctional(FunctionalHarvestTest): + + def test_harvest_create_rdf(self): + + self._test_harvest_create(self.rdf_mock_url, + self.rdf_content, + self.rdf_content_type) + + def test_harvest_create_ttl(self): + + self._test_harvest_create(self.ttl_mock_url, + self.ttl_content, + self.ttl_content_type) + + def test_harvest_create_with_config_content_Type(self): + + self._test_harvest_create(self.ttl_mock_url, + self.ttl_content, + 'text/plain', + config='{"rdf_format":"text/turtle"}') + + def test_harvest_create_unicode_keywords(self): + + self._test_harvest_create(self.ttl_mock_url, + self.ttl_unicode_in_keywords, + self.ttl_content_type) + + def test_harvest_create_commas_keywords(self): + + self._test_harvest_create(self.ttl_mock_url, + self.ttl_commas_in_keywords, + self.ttl_content_type) + + def _test_harvest_create(self, url, content, content_type, **kwargs): + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url, **kwargs) + + self._run_full_job(harvest_source['id'], num_objects=2) + + # Check that two datasets were created + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + + eq_(results['count'], 2) + for result in results['results']: + assert result['title'] in ('Example dataset 1', + 'Example dataset 2') + + def test_harvest_create_rdf_pagination(self): + + # Mock the GET requests needed to get the file + httpretty.register_uri(httpretty.GET, self.rdf_mock_url_pagination_1, + body=self.rdf_content_pagination_1, + content_type=self.rdf_content_type) + + httpretty.register_uri(httpretty.GET, self.rdf_mock_url_pagination_2, + body=self.rdf_content_pagination_2, + content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # them as well + httpretty.register_uri(httpretty.HEAD, self.rdf_mock_url_pagination_1, + status=405, + content_type=self.rdf_content_type) + + httpretty.register_uri(httpretty.HEAD, self.rdf_mock_url_pagination_2, + status=405, + content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source( + self.rdf_mock_url_pagination_1) + + self._run_full_job(harvest_source['id'], num_objects=4) + + # Check that four datasets were created + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + + eq_(results['count'], 4) + eq_(sorted([d['title'] for d in results['results']]), + ['Example dataset 1', 'Example dataset 2', + 'Example dataset 3', 'Example dataset 4']) + + def test_harvest_create_rdf_pagination_same_content(self): + + # Mock the GET requests needed to get the file. 
Two different URLs but + # same content to mock a misconfigured server + httpretty.register_uri(httpretty.GET, self.rdf_mock_url_pagination_1, + body=self.rdf_content_pagination_1, + content_type=self.rdf_content_type) + + httpretty.register_uri(httpretty.GET, self.rdf_mock_url_pagination_2, + body=self.rdf_content_pagination_1, + content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # them as well + httpretty.register_uri(httpretty.HEAD, self.rdf_mock_url_pagination_1, + status=405, + content_type=self.rdf_content_type) + + httpretty.register_uri(httpretty.HEAD, self.rdf_mock_url_pagination_2, + status=405, + content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source( + self.rdf_mock_url_pagination_1) + + self._run_full_job(harvest_source['id'], num_objects=2) + + # Check that two datasets were created + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + + eq_(results['count'], 2) + eq_(sorted([d['title'] for d in results['results']]), + ['Example dataset 1', 'Example dataset 2']) + + def test_harvest_update_rdf(self): + + self._test_harvest_update(self.rdf_mock_url, + self.rdf_content, + self.rdf_content_type) + + def test_harvest_update_ttl(self): + + self._test_harvest_update(self.ttl_mock_url, + self.ttl_content, + self.ttl_content_type) + + def test_harvest_update_unicode_keywords(self): + + self._test_harvest_create(self.ttl_mock_url, + self.ttl_unicode_in_keywords, + self.ttl_content_type) + + def test_harvest_update_commas_keywords(self): + + self._test_harvest_update(self.ttl_mock_url, + self.ttl_commas_in_keywords, + self.ttl_content_type) + + def _test_harvest_update(self, url, content, content_type): + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url) + + # First run, will create two datasets as previously tested + self._run_full_job(harvest_source['id'], num_objects=2) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Mock an update in the remote file + new_file = content.replace('Example dataset 1', + 'Example dataset 1 (updated)') + httpretty.register_uri(httpretty.GET, url, + body=new_file, content_type=content_type) + + # Run a second job + self._run_full_job(harvest_source['id'], num_objects=2) + + # Check that we still have two datasets + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + + eq_(results['count'], 2) + + # Check that the dataset was updated + for result in results['results']: + assert result['title'] in ('Example dataset 1 (updated)', + 'Example dataset 2') + + def test_harvest_update_existing_resources(self): + + existing, new = self._test_harvest_update_resources(self.rdf_mock_url, + self.rdf_content_with_distribution_uri, + self.rdf_content_type) + eq_(new['uri'], 'https://data.some.org/catalog/datasets/1/resource/1') + eq_(new['uri'], existing['uri']) + eq_(new['id'], existing['id']) + + def test_harvest_update_new_resources(self): + + existing, new = self._test_harvest_update_resources(self.rdf_mock_url, + self.rdf_content_with_distribution, + self.rdf_content_type) + 
eq_(existing['uri'], '') + eq_(new['uri'], '') + nose.tools.assert_is_not(new['id'], existing['id']) + + def _test_harvest_update_resources(self, url, content, content_type): + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url) + + # First run, create the dataset with the resource + self._run_full_job(harvest_source['id'], num_objects=1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # get the created dataset + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + eq_(results['count'], 1) + + existing_dataset = results['results'][0] + existing_resource = existing_dataset.get('resources')[0] + + # Mock an update in the remote file + new_file = content.replace('Example resource 1', + 'Example resource 1 (updated)') + httpretty.register_uri(httpretty.GET, url, + body=new_file, content_type=content_type) + + # Run a second job + self._run_full_job(harvest_source['id']) + + # get the updated dataset + new_results = h.call_action('package_search', {}, fq=fq) + eq_(new_results['count'], 1) + + new_dataset = new_results['results'][0] + new_resource = new_dataset.get('resources')[0] + + eq_(existing_resource['name'], 'Example resource 1') + eq_(len(new_dataset.get('resources')), 1) + eq_(new_resource['name'], 'Example resource 1 (updated)') + return (existing_resource, new_resource) + + def test_harvest_delete_rdf(self): + + self._test_harvest_delete(self.rdf_mock_url, + self.rdf_content, + self.rdf_remote_file_small, + self.rdf_content_type) + + def test_harvest_delete_ttl(self): + + self._test_harvest_delete(self.ttl_mock_url, + self.ttl_content, + self.ttl_remote_file_small, + self.ttl_content_type) + + def _test_harvest_delete(self, url, content, content_small, content_type): + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url) + + # First run, will create two datasets as previously tested + self._run_full_job(harvest_source['id'], num_objects=2) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Mock a deletion in the remote file + httpretty.register_uri(httpretty.GET, url, + body=content_small, content_type=content_type) + + # Run a second job + self._run_full_job(harvest_source['id'], num_objects=2) + + # Check that we only have one dataset + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + + eq_(results['count'], 1) + + eq_(results['results'][0]['title'], 'Example dataset 1') + + def test_harvest_bad_format_rdf(self): + + self._test_harvest_bad_format(self.rdf_mock_url, + self.rdf_remote_file_invalid, + self.rdf_content_type) + + def test_harvest_bad_format_ttl(self): + + self._test_harvest_bad_format(self.ttl_mock_url, + self.ttl_remote_file_invalid, + self.ttl_content_type) + + def _test_harvest_bad_format(self, url, bad_content, content_type): + + # Mock the GET 
request to get the file + httpretty.register_uri(httpretty.GET, url, + body=bad_content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Get the harvest source with the udpated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + + last_job_status = harvest_source['status']['last_job'] + + eq_(last_job_status['status'], 'Finished') + assert ('Error parsing the RDF file' + in last_job_status['gather_error_summary'][0][0]) + + @patch.object(ckanext.dcat.harvesters.rdf.RDFParser, 'datasets') + def test_harvest_exception_in_profile(self, mock_datasets): + mock_datasets.side_effect = Exception + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, self.rdf_mock_url, + body=self.rdf_content, content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, self.rdf_mock_url, + status=405, content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source(self.rdf_mock_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Get the harvest source with the udpated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + + last_job_status = harvest_source['status']['last_job'] + + eq_(last_job_status['status'], 'Finished') + assert ('Error when processsing dataset' + in last_job_status['gather_error_summary'][0][0]) + + def test_harvest_create_duplicate_titles(self): + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, self.rdf_mock_url_duplicates, + body=self.rdf_duplicate_titles, + content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, self.rdf_mock_url_duplicates, + status=405, + content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source(self.rdf_mock_url_duplicates) + + self._run_full_job(harvest_source['id'], num_objects=2) + + # Check that two datasets were created + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + + eq_(results['count'], 2) + for result in results['results']: + assert result['name'] in ('example-dataset', + 'example-dataset-1') + + +class TestDCATHarvestFunctionalExtensionPoints(FunctionalHarvestTest): + + @classmethod + def setup_class(self): + + super(TestDCATHarvestFunctionalExtensionPoints, self).setup_class() + + p.load('test_rdf_harvester') + + @classmethod + def teardown_class(self): + + p.unload('test_rdf_harvester') + + def setup(self): + + super(TestDCATHarvestFunctionalExtensionPoints, self).setup() + + plugin = p.get_plugin('test_rdf_harvester') + plugin.calls = defaultdict(int) + + def teardown(self): + + super(TestDCATHarvestFunctionalExtensionPoints, self).teardown() + + plugin = p.get_plugin('test_rdf_harvester') + plugin.calls = defaultdict(int) + + def 
test_harvest_before_download_extension_point_gets_called(self): + + plugin = p.get_plugin('test_rdf_harvester') + + harvest_source = self._create_harvest_source(self.rdf_mock_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['before_download'], 1) + + def test_harvest_before_download_null_url_stops_gather_stage(self): + + plugin = p.get_plugin('test_rdf_harvester') + + source_url = 'http://return.none' + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, source_url, + body=self.rdf_content, + content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, source_url, + status=405, + content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source(source_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['before_download'], 1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Check that the file was not requested + assert 'return.none' not in httpretty.last_request().headers['host'] + + # Get the harvest source with the updated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + + last_job_status = harvest_source['status']['last_job'] + + eq_(last_job_status['status'], 'Finished') + + eq_(last_job_status['stats']['added'], 0) + + def test_harvest_before_download_errors_get_stored(self): + + plugin = p.get_plugin('test_rdf_harvester') + + source_url = 'http://return.errors' + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, source_url, + body=self.rdf_content, + content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, source_url, + status=405, + content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source(source_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['before_download'], 1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Check that the file was not requested + assert 'return.errors' not in httpretty.last_request().headers['host'] + + # Get the harvest source with the updated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + + last_job_status = harvest_source['status']['last_job'] + + eq_('Error 1', last_job_status['gather_error_summary'][0][0]) + eq_('Error 2', last_job_status['gather_error_summary'][1][0]) + + def test_harvest_update_session_extension_point_gets_called(self): + + plugin = p.get_plugin('test_rdf_harvester') + + harvest_source = self._create_harvest_source(self.rdf_mock_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['update_session'], 1) + + def test_harvest_update_session_add_header(self): + + plugin = p.get_plugin('test_rdf_harvester') + + harvest_source = self._create_harvest_source(self.rdf_mock_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['update_session'], 1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Check that the header 
was actually set + assert ('true' + in httpretty.last_request().headers['x-test']) + + def test_harvest_after_download_extension_point_gets_called(self): + + plugin = p.get_plugin('test_rdf_harvester') + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, self.rdf_mock_url) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, self.rdf_mock_url, + status=405) + + harvest_source = self._create_harvest_source(self.rdf_mock_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['after_download'], 1) + + def test_harvest_after_download_empty_content_stops_gather_stage(self): + + plugin = p.get_plugin('test_rdf_harvester') + + source_url = 'http://return.empty.content' + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, source_url, + body='return.empty.content', + content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, source_url, + status=405, + content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source(source_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['after_download'], 1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Check that the file was requested + assert ('return.empty.content' + in httpretty.last_request().headers['host']) + + # Get the harvest source with the updated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + + last_job_status = harvest_source['status']['last_job'] + + eq_(last_job_status['status'], 'Finished') + + eq_(last_job_status['stats']['added'], 0) + + def test_harvest_after_download_errors_get_stored(self): + + plugin = p.get_plugin('test_rdf_harvester') + + source_url = 'http://return.content.errors' + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, source_url, + body='return.errors', + content_type=self.rdf_content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, source_url, + status=405, + content_type=self.rdf_content_type) + + harvest_source = self._create_harvest_source(source_url) + self._create_harvest_job(harvest_source['id']) + self._run_jobs(harvest_source['id']) + self._gather_queue(1) + + eq_(plugin.calls['after_download'], 1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Check that the file was requested + assert ('return.content.errors' + in httpretty.last_request().headers['host']) + + # Get the harvest source with the updated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + + last_job_status = harvest_source['status']['last_job'] + + eq_('Error 1', last_job_status['gather_error_summary'][0][0]) + eq_('Error 2', last_job_status['gather_error_summary'][1][0]) + + def test_harvest_import_extensions_point_gets_called(self): + + plugin = p.get_plugin('test_rdf_harvester') + + url = self.rdf_mock_url + content = self.rdf_content + content_type = self.rdf_content_type + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD 
request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url) + + # First run, will create two datasets as previously tested + self._run_full_job(harvest_source['id'], num_objects=2) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Get the harvest source with the updated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + last_job_status = harvest_source['status']['last_job'] + eq_(last_job_status['status'], 'Finished') + + eq_(plugin.calls['before_create'], 2) + eq_(plugin.calls['after_create'], 2) + eq_(plugin.calls['before_update'], 0) + eq_(plugin.calls['after_update'], 0) + + # Mock an update in the remote file + new_file = content.replace('Example dataset 1', + 'Example dataset 1 (updated)') + httpretty.register_uri(httpretty.GET, url, + body=new_file, content_type=content_type) + + # Run a second job + self._run_full_job(harvest_source['id'], num_objects=2) + + eq_(plugin.calls['before_create'], 2) + eq_(plugin.calls['after_create'], 2) + eq_(plugin.calls['before_update'], 2) + eq_(plugin.calls['after_update'], 2) + + +class TestDCATHarvestFunctionalSetNull(FunctionalHarvestTest): + + @classmethod + def setup_class(self): + super(TestDCATHarvestFunctionalSetNull, self).setup_class() + p.load('test_rdf_null_harvester') + + @classmethod + def teardown_class(self): + p.unload('test_rdf_null_harvester') + + def setup(self): + super(TestDCATHarvestFunctionalSetNull, self).setup() + + plugin = p.get_plugin('test_rdf_null_harvester') + plugin.calls = defaultdict(int) + + def teardown(self): + super(TestDCATHarvestFunctionalSetNull, self).teardown() + + plugin = p.get_plugin('test_rdf_null_harvester') + plugin.calls = defaultdict(int) + + def test_harvest_with_before_create_null(self): + plugin = p.get_plugin('test_rdf_null_harvester') + + url = self.rdf_mock_url + content = self.rdf_content + content_type = self.rdf_content_type + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url) + + self._run_full_job(harvest_source['id'], num_objects=2) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Get the harvest source with the updated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + last_job_status = harvest_source['status']['last_job'] + eq_(last_job_status['status'], 'Finished') + + nose.tools.assert_dict_equal( + last_job_status['stats'], + { + 'deleted': 0, + 'added': 0, + 'updated': 0, + 'not modified': 2, + 'errored': 0 + } + ) + + eq_(plugin.calls['before_create'], 2) + eq_(plugin.calls['after_create'], 0) + eq_(plugin.calls['before_update'], 0) + eq_(plugin.calls['after_update'], 0) + + +class TestDCATHarvestFunctionalRaiseExcpetion(FunctionalHarvestTest): + + @classmethod + def setup_class(self): + super(TestDCATHarvestFunctionalRaiseExcpetion, self).setup_class() + p.load('test_rdf_exception_harvester') + + @classmethod + def teardown_class(self): + p.unload('test_rdf_exception_harvester') + + def setup(self): + super(TestDCATHarvestFunctionalRaiseExcpetion, self).setup() + + plugin = 
p.get_plugin('test_rdf_exception_harvester') + plugin.calls = defaultdict(int) + + def teardown(self): + super(TestDCATHarvestFunctionalRaiseExcpetion, self).teardown() + + plugin = p.get_plugin('test_rdf_exception_harvester') + plugin.calls = defaultdict(int) + + def test_harvest_with_before_create_raising_exception(self): + plugin = p.get_plugin('test_rdf_exception_harvester') + + url = self.rdf_mock_url + content = self.rdf_content + content_type = self.rdf_content_type + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + harvest_source = self._create_harvest_source(url) + + self._run_full_job(harvest_source['id'], num_objects=2) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # Get the harvest source with the updated status + harvest_source = h.call_action('harvest_source_show', + id=harvest_source['id']) + last_job_status = harvest_source['status']['last_job'] + eq_(last_job_status['status'], 'Finished') + + assert ('Error importing dataset' + in last_job_status['object_error_summary'][0][0]) + + nose.tools.assert_dict_equal( + last_job_status['stats'], + { + 'deleted': 0, + 'added': 1, + 'updated': 0, + 'not modified': 0, + 'errored': 1 + } + ) + + eq_(plugin.calls['before_create'], 2) + eq_(plugin.calls['after_create'], 1) + eq_(plugin.calls['before_update'], 0) + eq_(plugin.calls['after_update'], 0) + + +class TestDCATRDFHarvester(object): + + def test_validates_correct_config(self): + harvester = DCATRDFHarvester() + + for config in ['{}', '{"rdf_format":"text/turtle"}']: + eq_(config, harvester.validate_config(config)) + + def test_does_not_validate_incorrect_config(self): + harvester = DCATRDFHarvester() + + for config in ['invalid', '{invalid}', '{rdf_format:invalid}']: + try: + harvester.validate_config(config) + assert False + except ValueError: + assert True + + +class TestIDCATRDFHarvester(object): + + def test_before_download(self): + + i = IDCATRDFHarvester() + + url = 'http://some.url' + + values = i.before_download(url, {}) + + eq_(values[0], url) + eq_(values[1], []) + + def test_after_download(self): + + i = IDCATRDFHarvester() + + content = 'some.content' + + values = i.after_download(content, {}) + + eq_(values[0], content) + eq_(values[1], []) diff --git a/ckanext/dcat/tests/nose/test_json_harvester.py b/ckanext/dcat/tests/nose/test_json_harvester.py new file mode 100644 index 00000000..190e1bd0 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_json_harvester.py @@ -0,0 +1,327 @@ +from __future__ import absolute_import +from builtins import object +import httpretty +from mock import call, patch, Mock + +import nose + +from ckan.logic import ValidationError +import ckantoolkit.tests.helpers as h + +import ckan.tests.factories as factories + +from ckanext.dcat.harvesters._json import copy_across_resource_ids, DCATJSONHarvester +from .test_harvester import FunctionalHarvestTest + +eq_ = nose.tools.eq_ + +class TestDCATJSONHarvestFunctional(FunctionalHarvestTest): + + # invalid tags dataset + json_content_invalid_tags = ''' + { + "@type": "dcat:Dataset", + "name": "Invalid tags", + "identifier": "http://example.com/datasets/invalid_example", + "title": "Example dataset with invalid tags", + "description": "Invalid keywords", + "publisher": {"name":"Example Department of Wildlife"}, + 
"license": "https://example.com/license", + "keyword": ["example", "test's", "invalid & wrong"] + } + ''' + + @classmethod + def setup_class(cls): + super(TestDCATJSONHarvestFunctional, cls).setup_class() + + # Remote DCAT JSON / data.json file + cls.json_mock_url = 'http://some.dcat.file.json' + cls.json_content_type = 'application/json' + + # minimal dataset + cls.json_content = ''' +{ + "dataset":[ + {"@type": "dcat:Dataset", + "identifier": "http://example.com/datasets/example1", + "title": "Example dataset 1", + "description": "Lots of species", + "publisher": {"name": "Example Department of Wildlife"}, + "license": "https://example.com/license" + }, + {"@type": "dcat:Dataset", + "identifier": "http://example.com/datasets/example1", + "title": "Example dataset 2", + "description": "More species", + "publisher": {"name":"Example Department of Wildlife"}, + "license": "https://example.com/license" + } + ] +} + ''' + + cls.json_content_with_distribution = ''' +{ + "dataset":[ + {"@type": "dcat:Dataset", + "identifier": "http://example.com/datasets/example1", + "title": "Example dataset 1", + "description": "Lots of species", + "publisher": {"name": "Example Department of Wildlife"}, + "license": "https://example.com/license", + "distribution":[ + {"@type":"dcat:Distribution", + "title":"Example resource 1", + "format":"Web page", + "mediaType":"text/html", + "accessURL":"http://example.com/datasets/example1"} + ] + } + ] +} + ''' + + # invalid_tags dataset + cls.json_content_invalid_tags_dataset = '{"dataset":[%s]}' % cls.json_content_invalid_tags + + def test_harvest_create(self): + + self._test_harvest_create(self.json_mock_url, + self.json_content, + self.json_content_type, + exp_titles=['Example dataset 1', 'Example dataset 2']) + + def _test_harvest_create( + self, url, content, content_type, num_datasets=2, + exp_num_datasets=2, exp_titles=[], + **kwargs + ): + + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content, content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + kwargs['source_type'] = 'dcat_json' + harvest_source = self._create_harvest_source(url, **kwargs) + + self._run_full_job(harvest_source['id'], num_objects=num_datasets) + + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + + eq_(results['count'], exp_num_datasets) + + if exp_titles: + for result in results['results']: + assert result['title'] in exp_titles + + def test_harvest_update_existing_resources(self): + + content = self.json_content_with_distribution + existing_resources, new_resources = \ + self._test_harvest_twice(content, content) + + # number of resources unchanged + eq_(len(existing_resources), 1) + eq_(len(new_resources), 1) + # because the resource metadata is unchanged, the ID is kept the same + eq_(new_resources[0]['id'], existing_resources[0]['id']) + + def test_harvest_update_new_resources(self): + + content = self.json_content_with_distribution + content_modified = content.replace( + '"accessURL":"http://example.com/datasets/example1"', + '"accessURL":"http://example.com/datasets/new"') + existing_resources, new_resources = \ + self._test_harvest_twice(content, content) + + # number of resources unchanged + eq_(len(existing_resources), 1) + eq_(len(new_resources), 1) + # because the resource metadata has a new URL, the ID 
is new + nose.tools.assert_is_not(new_resources[0]['id'], + existing_resources[0]['id']) + + def _test_harvest_twice(self, content_first_harvest, + content_second_harvest): + '''Based on _test_harvest_update_resources''' + url = self.json_mock_url + content_type = self.json_content_type + # Mock the GET request to get the file + httpretty.register_uri(httpretty.GET, url, + body=content_first_harvest, + content_type=content_type) + + # The harvester will try to do a HEAD request first so we need to mock + # this as well + httpretty.register_uri(httpretty.HEAD, url, + status=405, content_type=content_type) + + kwargs = {'source_type': 'dcat_json'} + harvest_source = self._create_harvest_source(url, **kwargs) + + # First run, create the dataset with the resource + self._run_full_job(harvest_source['id'], num_objects=1) + + # Run the jobs to mark the previous one as Finished + self._run_jobs() + + # get the created dataset + fq = "+type:dataset harvest_source_id:{0}".format(harvest_source['id']) + results = h.call_action('package_search', {}, fq=fq) + eq_(results['count'], 1) + + existing_dataset = results['results'][0] + existing_resources = existing_dataset.get('resources') + + # Mock an update in the remote dataset. + # Change title just to be sure we harvest ok + content_second_harvest = \ + content_second_harvest.replace('Example dataset 1', + 'Example dataset 1 (updated)') + httpretty.register_uri(httpretty.GET, url, + body=content_second_harvest, + content_type=content_type) + + # Run a second job + self._run_full_job(harvest_source['id']) + + # get the updated dataset + new_results = h.call_action('package_search', {}, fq=fq) + eq_(new_results['count'], 1) + + new_dataset = new_results['results'][0] + new_resources = new_dataset.get('resources') + + eq_(existing_dataset['title'], 'Example dataset 1') + eq_(new_dataset['title'], 'Example dataset 1 (updated)') + + return (existing_resources, new_resources) + + def test_harvest_does_not_create_with_invalid_tags(self): + self._test_harvest_create( + 'http://some.dcat.file.invalid.json', + self.json_content_invalid_tags_dataset, + self.json_content_type, + num_datasets=1, + exp_num_datasets=0) + + +class TestCopyAcrossResourceIds(object): + def test_copied_because_same_uri(self): + harvested_dataset = {'resources': [ + {'uri': 'http://abc', 'url': 'http://abc'}]} + copy_across_resource_ids({'resources': [ + {'uri': 'http://abc', 'url': 'http://def', 'id': '1'}]}, + harvested_dataset, + ) + eq_(harvested_dataset['resources'][0].get('id'), '1') + eq_(harvested_dataset['resources'][0].get('url'), 'http://abc') + + def test_copied_because_same_url(self): + harvested_dataset = {'resources': [ + {'url': 'http://abc'}]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'id': '1'}]}, + harvested_dataset, + ) + eq_(harvested_dataset['resources'][0].get('id'), '1') + + def test_copied_with_same_url_and_changed_title(self): + harvested_dataset = {'resources': [ + {'url': 'http://abc', 'title': 'link updated'}]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'title': 'link', 'id': '1'}]}, + harvested_dataset, + ) + eq_(harvested_dataset['resources'][0].get('id'), '1') + + def test_copied_with_repeated_urls_but_unique_titles(self): + harvested_dataset = {'resources': [ + {'url': 'http://abc', 'title': 'link1'}, + {'url': 'http://abc', 'title': 'link5'}, + {'url': 'http://abc', 'title': 'link3'}, + {'url': 'http://abc', 'title': 'link2'}, + {'url': 'http://abc', 'title': 'link4'}, + {'url': 'http://abc', 'title': 
'link new'}, + ]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'title': 'link1', 'id': '1'}, + {'url': 'http://abc', 'title': 'link2', 'id': '2'}, + {'url': 'http://abc', 'title': 'link3', 'id': '3'}, + {'url': 'http://abc', 'title': 'link4', 'id': '4'}, + {'url': 'http://abc', 'title': 'link5', 'id': '5'}, + ]}, + harvested_dataset, + ) + eq_([(r.get('id'), r['title']) for r in harvested_dataset['resources']], + [('1', 'link1'), ('5', 'link5'), ('3', 'link3'), ('2', 'link2'), + ('4', 'link4'), (None, 'link new')]) + + def test_not_copied_because_completely_different(self): + harvested_dataset = {'resources': [ + {'url': 'http://def', 'title': 'link other'}]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'title': 'link', 'id': '1'}]}, + harvested_dataset, + ) + eq_(harvested_dataset['resources'][0].get('id'), None) + + +class TestImportStage(object): + + @classmethod + def setup_class(cls): + h.reset_db() + + class MockHarvestObject(object): + guid = 'test_guid' + content = TestDCATJSONHarvestFunctional.json_content_invalid_tags + + class MockStatus(object): + key = 'status' + value = 'new' + + extras = [MockStatus()] + package = None + + class MockSource(object): + id = 'test_id' + + source = MockSource() + + def add(self): + pass + + class MockSourceDataset(object): + def __init__(self, owner_org=None): + self.owner_org = owner_org['id'] + + @patch('ckanext.dcat.harvesters._json.model.Package.get') + @patch('ckanext.dcat.harvesters._json.DCATJSONHarvester._save_object_error') + def test_import_invalid_tags( + self, mock_save_object_error, mock_model_package_get + ): + user = factories.User() + owner_org = factories.Organization( + users=[{'name': user['id'], 'capacity': 'admin'}] + ) + + mock_model_package_get.return_value = self.MockSourceDataset(owner_org) + + harvester = DCATJSONHarvester() + + mock_harvest_object = self.MockHarvestObject() + harvester.import_stage(mock_harvest_object) + + args, _ = mock_save_object_error.call_args_list[0] + + assert 'Error importing dataset Invalid tags: ValidationError(None,)' in args[0] + assert '{\'tags\': [{}, u\'Tag "test\\\'s" must be alphanumeric characters or symbols: -_.\', u\'Tag "invalid & wrong" must be alphanumeric characters or symbols: -_.\']}' in args[0] diff --git a/ckanext/dcat/tests/nose/test_logic.py b/ckanext/dcat/tests/nose/test_logic.py new file mode 100644 index 00000000..93e22151 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_logic.py @@ -0,0 +1,309 @@ +from builtins import range +from builtins import object +import nose +import mock + +from six.moves import xrange + +from ckantoolkit import config + +from ckan.plugins import toolkit + +from ckantoolkit.tests import helpers, factories + +from ckanext.dcat.logic import _pagination_info +from ckanext.dcat.processors import RDFParser + +from ckanext.dcat.tests.nose import DCATFunctionalTestBase + +eq_ = nose.tools.eq_ +assert_raises = nose.tools.assert_raises + + +class TestPagination(object): + + @helpers.change_config('ckanext.dcat.datasets_per_page', 10) + @helpers.change_config('ckan.site_url', 'http://example.com') + @mock.patch('ckan.plugins.toolkit.request') + def test_pagination(self, mock_request): + + mock_request.params = {} + mock_request.host_url = 'http://ckan.example.com' + mock_request.path = '' + + # No page defined (defaults to 1) + query = { + 'count': 12, + 'results': [x for x in range(10)], + } + data_dict = { + 'page': None + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 12) + 
eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + eq_(pagination['current'], 'http://example.com?page=1') + eq_(pagination['first'], 'http://example.com?page=1') + eq_(pagination['last'], 'http://example.com?page=2') + eq_(pagination['next'], 'http://example.com?page=2') + assert 'previous' not in pagination + + # Page 1 + query = { + 'count': 12, + 'results': [x for x in range(10)], + } + data_dict = { + 'page': 1 + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 12) + eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + eq_(pagination['current'], 'http://example.com?page=1') + eq_(pagination['first'], 'http://example.com?page=1') + eq_(pagination['last'], 'http://example.com?page=2') + eq_(pagination['next'], 'http://example.com?page=2') + assert 'previous' not in pagination + + # Page 2 + query = { + 'count': 12, + 'results': [x for x in range(2)], + } + data_dict = { + 'page': 2 + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 12) + eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + eq_(pagination['current'], 'http://example.com?page=2') + eq_(pagination['first'], 'http://example.com?page=1') + eq_(pagination['last'], 'http://example.com?page=2') + eq_(pagination['previous'], 'http://example.com?page=1') + assert 'next' not in pagination + + # Page 3 + query = { + 'count': 12, + 'results': [x for x in range(2)], + } + data_dict = { + 'page': 3 + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 12) + eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + eq_(pagination['current'], 'http://example.com?page=3') + eq_(pagination['first'], 'http://example.com?page=1') + eq_(pagination['last'], 'http://example.com?page=2') + eq_(pagination['previous'], 'http://example.com?page=2') + assert 'next' not in pagination + + @helpers.change_config('ckanext.dcat.datasets_per_page', 100) + @helpers.change_config('ckan.site_url', 'http://example.com') + @mock.patch('ckan.plugins.toolkit.request') + def test_pagination_less_results_than_page_size(self, mock_request): + + mock_request.params = {} + mock_request.host_url = 'http://ckan.example.com' + mock_request.path = '' + + # No page defined (defaults to 1) + query = { + 'count': 12, + 'results': [x for x in range(12)], + } + data_dict = { + 'page': None + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 12) + eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + eq_(pagination['current'], 'http://example.com?page=1') + eq_(pagination['first'], 'http://example.com?page=1') + eq_(pagination['last'], 'http://example.com?page=1') + assert 'next' not in pagination + assert 'previous' not in pagination + + @helpers.change_config('ckanext.dcat.datasets_per_page', 10) + @helpers.change_config('ckan.site_url', 'http://example.com') + @mock.patch('ckan.plugins.toolkit.request') + def test_pagination_same_results_than_page_size(self, mock_request): + + mock_request.params = {} + mock_request.host_url = 'http://ckan.example.com' + mock_request.path = '' + + # No page defined (defaults to 1) + query = { + 'count': 10, + 'results': [x for x in range(10)], + } + data_dict = { + 'page': None + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 10) + eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + 
eq_(pagination['current'], 'http://example.com?page=1') + eq_(pagination['first'], 'http://example.com?page=1') + eq_(pagination['last'], 'http://example.com?page=1') + assert 'next' not in pagination + assert 'previous' not in pagination + + @helpers.change_config('ckanext.dcat.datasets_per_page', 10) + @helpers.change_config('ckan.site_url', 'http://example.com') + @mock.patch('ckan.plugins.toolkit.request') + def test_pagination_keeps_only_supported_params(self, mock_request): + + mock_request.params = {'a': 1, 'b': 2, 'modified_since': '2018-03-22', 'profiles': 'schemaorg'} + mock_request.host_url = 'http://ckan.example.com' + mock_request.path = '/feed/catalog.xml' + + # No page defined (defaults to 1) + query = { + 'count': 12, + 'results': [x for x in range(10)], + } + data_dict = { + 'page': None + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 12) + eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + eq_(pagination['current'], 'http://example.com/feed/catalog.xml?modified_since=2018-03-22&profiles=schemaorg&page=1') + eq_(pagination['first'], 'http://example.com/feed/catalog.xml?modified_since=2018-03-22&profiles=schemaorg&page=1') + eq_(pagination['last'], 'http://example.com/feed/catalog.xml?modified_since=2018-03-22&profiles=schemaorg&page=2') + eq_(pagination['next'], 'http://example.com/feed/catalog.xml?modified_since=2018-03-22&profiles=schemaorg&page=2') + assert 'previous' not in pagination + + @helpers.change_config('ckanext.dcat.datasets_per_page', 10) + @helpers.change_config('ckan.site_url', '') + @mock.patch('ckan.plugins.toolkit.request') + def test_pagination_without_site_url(self, mock_request): + + mock_request.params = {} + mock_request.host_url = 'http://ckan.example.com' + mock_request.path = '/feed/catalog.xml' + + # No page defined (defaults to 1) + query = { + 'count': 12, + 'results': [x for x in range(10)], + } + data_dict = { + 'page': None + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination['count'], 12) + eq_(pagination['items_per_page'], + config.get('ckanext.dcat.datasets_per_page')) + eq_(pagination['current'], 'http://ckan.example.com/feed/catalog.xml?page=1') + eq_(pagination['first'], 'http://ckan.example.com/feed/catalog.xml?page=1') + eq_(pagination['last'], 'http://ckan.example.com/feed/catalog.xml?page=2') + eq_(pagination['next'], 'http://ckan.example.com/feed/catalog.xml?page=2') + assert 'previous' not in pagination + + def test_pagination_no_results_empty_dict(self): + query = { + 'count': 0, + 'results': [], + } + data_dict = { + 'page': None + } + + pagination = _pagination_info(query, data_dict) + + eq_(pagination, {}) + + def test_pagination_wrong_page(self): + query = { + 'count': 10, + 'results': [x for x in range(10)], + } + data_dict = { + 'page': 'a' + } + + assert_raises(toolkit.ValidationError, + _pagination_info, query, data_dict) + + def test_pagination_wrong_page_number(self): + query = { + 'count': 10, + 'results': [x for x in range(10)], + } + data_dict = { + 'page': '-1' + } + + assert_raises(toolkit.ValidationError, + _pagination_info, query, data_dict) + + +class TestActions(DCATFunctionalTestBase): + def test_dataset_show_with_format(self): + dataset = factories.Dataset( + notes='Test dataset' + ) + + content = helpers.call_action('dcat_dataset_show', id=dataset['id'], _format='xml') + + # Parse the contents to check it's an actual serialization + p = RDFParser() + + p.parse(content, _format='xml') + + dcat_datasets = [d for d in 
p.datasets()] + + eq_(len(dcat_datasets), 1) + + dcat_dataset = dcat_datasets[0] + + eq_(dcat_dataset['title'], dataset['title']) + eq_(dcat_dataset['notes'], dataset['notes']) + + def test_dataset_show_without_format(self): + dataset = factories.Dataset( + notes='Test dataset' + ) + + content = helpers.call_action('dcat_dataset_show', id=dataset['id']) + + # Parse the contents to check it's an actual serialization + p = RDFParser() + + p.parse(content) + + dcat_datasets = [d for d in p.datasets()] + + eq_(len(dcat_datasets), 1) + + dcat_dataset = dcat_datasets[0] + + eq_(dcat_dataset['title'], dataset['title']) + eq_(dcat_dataset['notes'], dataset['notes']) diff --git a/ckanext/dcat/tests/nose/test_schemaorg_profile_serialize.py b/ckanext/dcat/tests/nose/test_schemaorg_profile_serialize.py new file mode 100644 index 00000000..bbf931f3 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_schemaorg_profile_serialize.py @@ -0,0 +1,576 @@ +from builtins import str +import json + +import nose + +from ckantoolkit import config + +from dateutil.parser import parse as parse_date +from rdflib import URIRef, BNode, Literal +from rdflib.namespace import RDF + +from ckantoolkit.tests import helpers, factories + +from ckanext.dcat import utils +from ckanext.dcat.processors import RDFSerializer +from ckanext.dcat.profiles import SCHEMA + +from ckanext.dcat.tests.nose.test_euro_dcatap_profile_serialize import BaseSerializeTest + +eq_ = nose.tools.eq_ +assert_true = nose.tools.assert_true + + +class TestSchemaOrgProfileSerializeDataset(BaseSerializeTest): + + def test_graph_from_dataset(self): + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'notes': 'Lorem ipsum', + 'url': 'http://example.com/ds1', + 'version': '1.0b', + 'metadata_created': '2015-06-26T15:21:09.034694', + 'metadata_modified': '2015-06-26T15:21:09.075774', + 'license_title': 'CC-BY 3.0', + 'license_url': 'http://creativecommons.org/licenses/by/3.0/', + 'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}], + 'extras': [ + {'key': 'alternate_identifier', 'value': '[\"xyz\", \"abc\"]'}, + {'key': 'identifier', 'value': '26be5452-fc5c-11e7-8450-fea9aa178066'}, + {'key': 'version_notes', 'value': 'This is a beta version'}, + {'key': 'frequency', 'value': 'monthly'}, + {'key': 'language', 'value': '[\"en\"]'}, + {'key': 'theme', 'value': '[\"http://eurovoc.europa.eu/100142\", \"http://eurovoc.europa.eu/100152\"]'}, + {'key': 'conforms_to', 'value': '[\"Standard 1\", \"Standard 2\"]'}, + {'key': 'access_rights', 'value': 'public'}, + {'key': 'documentation', 'value': '[\"http://dataset.info.org/doc1\", \"http://dataset.info.org/doc2\"]'}, + {'key': 'provenance', 'value': 'Some statement about provenance'}, + {'key': 'dcat_type', 'value': 'test-type'}, + {'key': 'related_resource', 'value': '[\"http://dataset.info.org/related1\", \"http://dataset.info.org/related2\"]'}, + {'key': 'has_version', 'value': '[\"https://data.some.org/catalog/datasets/derived-dataset-1\", \"https://data.some.org/catalog/datasets/derived-dataset-2\"]'}, + {'key': 'is_version_of', 'value': '[\"https://data.some.org/catalog/datasets/original-dataset\"]'}, + {'key': 'source', 'value': '[\"https://data.some.org/catalog/datasets/source-dataset-1\", \"https://data.some.org/catalog/datasets/source-dataset-2\"]'}, + {'key': 'sample', 'value': '[\"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample\"]'}, + ] + } + extras = self._extras(dataset) + + s = 
RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + eq_(str(dataset_ref), utils.dataset_uri(dataset)) + + # Basic fields + assert self._triple(g, dataset_ref, RDF.type, SCHEMA.Dataset) + assert self._triple(g, dataset_ref, SCHEMA.name, dataset['title']) + assert self._triple(g, dataset_ref, SCHEMA.description, dataset['notes']) + assert self._triple(g, dataset_ref, SCHEMA.version, dataset['version']) + assert self._triple(g, dataset_ref, SCHEMA.license, dataset['license_url']) + assert self._triple(g, dataset_ref, SCHEMA.identifier, extras['identifier']) + url = self._triple(g, dataset_ref, SCHEMA.url, None)[2] + assert url + eq_(url, Literal('http://test.ckan.net/dataset/%s' % dataset['name'])) + + # Dates + assert self._triple(g, dataset_ref, SCHEMA.datePublished, dataset['metadata_created']) + assert self._triple(g, dataset_ref, SCHEMA.dateModified, dataset['metadata_modified']) + + # Tags + eq_(len([t for t in g.triples((dataset_ref, SCHEMA.keywords, None))]), 2) + for tag in dataset['tags']: + assert self._triple(g, dataset_ref, SCHEMA.keywords, tag['name']) + + # List + for item in [ + ('language', SCHEMA.inLanguage, Literal), + ]: + values = json.loads(extras[item[0]]) + eq_(len([t for t in g.triples((dataset_ref, item[1], None))]), len(values)) + for value in values: + assert self._triple(g, dataset_ref, item[1], item[2](value)) + + def test_publisher_extras(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'organization': { + 'id': '', + 'name': 'publisher1', + 'title': 'Example Publisher from Org', + }, + 'extras': [ + {'key': 'publisher_uri', 'value': 'http://example.com/publisher'}, + {'key': 'publisher_name', 'value': 'Example Publisher'}, + {'key': 'publisher_email', 'value': 'publisher@example.com'}, + {'key': 'publisher_url', 'value': 'http://example.com/publisher/home'}, + {'key': 'publisher_type', 'value': 'http://purl.org/adms/publishertype/Company'}, + ] + + + } + extras = self._extras(dataset) + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + publisher = self._triple(g, dataset_ref, SCHEMA.publisher, None)[2] + assert publisher + eq_(str(publisher), extras['publisher_uri']) + assert self._triple(g, publisher, RDF.type, SCHEMA.Organization) + assert self._triple(g, publisher, SCHEMA.name, extras['publisher_name']) + + contact_point = self._triple(g, publisher, SCHEMA.contactPoint, None)[2] + assert contact_point + assert self._triple(g, contact_point, RDF.type, SCHEMA.ContactPoint) + assert self._triple(g, contact_point, SCHEMA.name, extras['publisher_name']) + assert self._triple(g, contact_point, SCHEMA.email, extras['publisher_email']) + assert self._triple(g, contact_point, SCHEMA.url, extras['publisher_url']) + assert self._triple(g, contact_point, SCHEMA.contactType, 'customer service') + + def test_publisher_org(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'organization': { + 'id': '', + 'name': 'publisher1', + 'title': 'Example Publisher from Org', + } + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + publisher = self._triple(g, dataset_ref, SCHEMA.publisher, None)[2] + assert publisher + assert self._triple(g, publisher, RDF.type, SCHEMA.Organization) + assert self._triple(g, publisher, SCHEMA.name, dataset['organization']['title']) + + def test_groups(self): + dataset = { + 'id': 
'4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'groups': [ + { + 'id': 'geography', + 'name': 'geography', + 'display_name': 'Geography', + }, + { + 'id': 'statistics', + 'name': 'statistics', + 'display_name': 'Statistics', + }, + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + about = self._triples(g, dataset_ref, SCHEMA.about, None) + assert len(about) == 2, 'There are not exactly 2 groups' + + names = [] + urls = [] + + for item in about: + names.append(str(g.value(item[2], SCHEMA.name))) + urls.append(str(g.value(item[2], SCHEMA.url))) + + eq_(sorted(names), ['geography', 'statistics']) + eq_(sorted(urls), [ + 'http://test.ckan.net/group/geography', + 'http://test.ckan.net/group/statistics']) + + @helpers.change_config('ckan.site_url', 'http://ckan.example.org') + @helpers.change_config('ckan.site_description', 'CKAN Portal') + @helpers.change_config('ckan.site_title', 'ckan.example.org') + def test_catalog(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + } + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + data_catalog = self._triple(g, dataset_ref, SCHEMA.includedInDataCatalog, None)[2] + assert data_catalog + assert self._triple(g, data_catalog, RDF.type, SCHEMA.DataCatalog) + assert self._triple(g, data_catalog, SCHEMA.url, 'http://ckan.example.org') + assert self._triple(g, data_catalog, SCHEMA.name, 'ckan.example.org') + assert self._triple(g, data_catalog, SCHEMA.description, 'CKAN Portal') + + def test_temporal_start_and_end(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'temporal_start', 'value': '2015-06-26T15:21:09.075774'}, + {'key': 'temporal_end', 'value': '2015-07-14'}, + ] + } + extras = self._extras(dataset) + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert self._triple(g, dataset_ref, SCHEMA.temporalCoverage, '2015-06-26T15:21:09.075774/2015-07-14') + + def test_temporal_start_only(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'temporal_start', 'value': '2015-06-26T15:21:09.075774'}, + ] + } + extras = self._extras(dataset) + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + assert self._triple(g, dataset_ref, SCHEMA.temporalCoverage, parse_date(extras['temporal_start']).isoformat()) + + def test_spatial(self): + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'extras': [ + {'key': 'spatial_uri', 'value': 'http://sws.geonames.org/6361390/'}, + {'key': 'spatial_text', 'value': 'Tarragona'}, + {'key': 'spatial', 'value': '{"type": "Polygon", "coordinates": [[[1.1870606,41.0786393],[1.1870606,41.1655218],[1.3752339,41.1655218],[1.3752339,41.0786393],[1.1870606,41.0786393]]]}'}, + + ] + } + extras = self._extras(dataset) + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + spatial = self._triple(g, dataset_ref, SCHEMA.spatialCoverage, None)[2] + assert spatial + eq_(str(spatial), extras['spatial_uri']) + assert self._triple(g, spatial, RDF.type, SCHEMA.Place) + assert self._triple(g, spatial, SCHEMA.description, extras['spatial_text']) + geo = self._triple(g, spatial, SCHEMA.geo, None)[2] + assert self._triple(g, geo, RDF.type, 
SCHEMA.GeoShape) + assert self._triple(g, geo, SCHEMA.polygon, extras['spatial']) + + def test_distributions(self): + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file' + }, + { + 'id': '8bceeda9-0084-477f-aa33-dad6148900d5', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'XLS file' + }, + { + 'id': 'da73d939-0f11-45a1-9733-5de108383133', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'PDF file' + }, + + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + eq_(len([t for t in g.triples((dataset_ref, SCHEMA.distribution, None))]), 3) + + for resource in dataset['resources']: + distribution = self._triple(g, + dataset_ref, + SCHEMA.distribution, + URIRef(utils.resource_uri(resource)))[2] + + assert self._triple(g, distribution, RDF.type, SCHEMA.DataDownload) + assert self._triple(g, distribution, SCHEMA.name, resource['name']) + + def test_distribution_fields(self): + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'description': 'A CSV file', + 'url': 'http://example.com/data/file.csv', + 'status': 'http://purl.org/adms/status/Completed', + 'rights': 'Some statement about rights', + 'license': 'http://creativecommons.org/licenses/by/3.0/', + 'issued': '2015-06-26T15:21:09.034694', + 'modified': '2015-06-26T15:21:09.075774', + 'size': 1234, + 'language': '[\"en\", \"es\", \"ca\"]', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + eq_(len([t for t in g.triples((dataset_ref, SCHEMA.distribution, None))]), 1) + + # URI + distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + eq_(str(distribution), utils.resource_uri(resource)) + + # Basic fields + assert self._triple(g, distribution, RDF.type, SCHEMA.DataDownload) + assert self._triple(g, distribution, SCHEMA.name, resource['name']) + assert self._triple(g, distribution, SCHEMA.description, resource['description']) + assert self._triple(g, distribution, SCHEMA.license, resource['license']) + + # List + for item in [ + ('language', SCHEMA.inLanguage), + ]: + values = json.loads(resource[item[0]]) + eq_(len([t for t in g.triples((distribution, item[1], None))]), len(values)) + for value in values: + assert self._triple(g, distribution, item[1], value) + + # Dates + assert self._triple(g, distribution, SCHEMA.datePublished, resource['issued']) + assert self._triple(g, distribution, SCHEMA.dateModified, resource['modified']) + + # Numbers + assert self._triple(g, distribution, SCHEMA.contentSize, resource['size']) + + def test_distribution_access_url_only(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + 
distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + + assert self._triple(g, distribution, SCHEMA.url, resource['url']) + assert self._triple(g, distribution, SCHEMA.contentUrl, None) is None + + def test_distribution_download_url_only(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + + assert self._triple(g, distribution, SCHEMA.contentUrl, resource['download_url']) + assert self._triple(g, distribution, SCHEMA.url, None) is None + + def test_distribution_both_urls_different(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + + assert self._triple(g, distribution, SCHEMA.url, resource['url']) + assert self._triple(g, distribution, SCHEMA.contentUrl, resource['download_url']) + + def test_distribution_both_urls_the_same(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file.csv', + 'download_url': 'http://example.com/data/file.csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + + assert self._triple(g, distribution, SCHEMA.contentUrl, resource['url']) + assert self._triple(g, distribution, SCHEMA.url, None) is None + + def test_distribution_format(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file.csv', + 'format': 'CSV', + 'mimetype': 'text/csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + + assert self._triple(g, distribution, SCHEMA.encodingFormat, resource['format']) + + def test_distribution_format_with_mimetype_fallback(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file.csv', + 'format': '', + 'mimetype': 'text/csv', + } + + dataset = { + 'id': 
'4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + + assert self._triple(g, distribution, SCHEMA.encodingFormat, resource['mimetype']) + + def test_distribution_format_with_backslash(self): + + resource = { + 'id': 'c041c635-054f-4431-b647-f9186926d021', + 'package_id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'CSV file', + 'url': 'http://example.com/data/file.csv', + 'format': 'text/csv', + } + + dataset = { + 'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6', + 'name': 'test-dataset', + 'title': 'Test DCAT dataset', + 'resources': [ + resource + ] + } + + s = RDFSerializer(profiles=['schemaorg']) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + distribution = self._triple(g, dataset_ref, SCHEMA.distribution, None)[2] + + assert self._triple(g, distribution, SCHEMA.encodingFormat, resource['format']) + diff --git a/ckanext/dcat/tests/nose/test_utils.py b/ckanext/dcat/tests/nose/test_utils.py new file mode 100644 index 00000000..2c920806 --- /dev/null +++ b/ckanext/dcat/tests/nose/test_utils.py @@ -0,0 +1,129 @@ +from builtins import object +import nose + +from ckanext.dcat.utils import parse_accept_header + +eq_ = nose.tools.eq_ + + +class TestAcceptHeaders(object): + + def test_empty(self): + + header = '' + + _format = parse_accept_header(header) + + eq_(_format, None) + + def test_basic_found(self): + + header = 'application/rdf+xml' + + _format = parse_accept_header(header) + + eq_(_format, 'rdf') + + def test_basic_not_found(self): + + header = 'image/gif' + + _format = parse_accept_header(header) + + eq_(_format, None) + + def test_multiple(self): + + header = 'application/rdf+xml, application/ld+json' + + _format = parse_accept_header(header) + + eq_(_format, 'rdf') + + def test_multiple_not_found(self): + + header = 'image/gif, text/unknown' + + _format = parse_accept_header(header) + + eq_(_format, None) + + def test_multiple_first_not_found(self): + + header = 'image/gif, application/ld+json, text/turtle' + + _format = parse_accept_header(header) + + eq_(_format, 'jsonld') + + def test_q_param(self): + + header = 'text/turtle; q=0.8' + + _format = parse_accept_header(header) + + eq_(_format, 'ttl') + + def test_q_param_multiple(self): + + header = 'text/turtle; q=0.8, text/n3; q=0.6' + + _format = parse_accept_header(header) + + eq_(_format, 'ttl') + + def test_q_param_multiple_first_not_found(self): + + header = 'image/gif; q=1.0, text/turtle; q=0.8, text/n3; q=0.6' + + _format = parse_accept_header(header) + + eq_(_format, 'ttl') + + def test_wildcard(self): + + header = 'text/*' + + _format = parse_accept_header(header) + + assert _format in ('ttl', 'n3') + + def test_wildcard_multiple(self): + + header = 'image/gif; q=1.0, text/*; q=0.5' + + _format = parse_accept_header(header) + + assert _format in ('ttl', 'n3') + + def test_double_wildcard(self): + + header = '*/*' + + _format = parse_accept_header(header) + + eq_(_format, None) + + def test_double_wildcard_multiple(self): + + header = 'image/gif; q=1.0, text/csv; q=0.8, */*; q=0.1' + + _format = parse_accept_header(header) + + eq_(_format, None) + + def test_html(self): + + header = 'text/html' + + _format = parse_accept_header(header) + + eq_(_format, None) + + def test_html_multiple(self): + + header = 'image/gif; q=1.0, text/html; q=0.8, 
text/turtle; q=0.6' + + _format = parse_accept_header(header) + + eq_(_format, None) diff --git a/ckanext/dcat/tests/test_base_parser.py b/ckanext/dcat/tests/test_base_parser.py index 0bf21487..ceba5ed4 100644 --- a/ckanext/dcat/tests/test_base_parser.py +++ b/ckanext/dcat/tests/test_base_parser.py @@ -1,6 +1,7 @@ from builtins import str from builtins import object -import nose + +import pytest from ckantoolkit import config @@ -20,8 +21,6 @@ DCT = Namespace("http://purl.org/dc/terms/") DCAT = Namespace("http://www.w3.org/ns/dcat#") -eq_ = nose.tools.eq_ - def _default_graph(): @@ -78,7 +77,7 @@ def test_default_profile(self): p = RDFParser() - eq_(sorted([pr.name for pr in p._profiles]), + assert (sorted([pr.name for pr in p._profiles]) == sorted(DEFAULT_RDF_PROFILES)) def test_profiles_via_config_option(self): @@ -90,7 +89,7 @@ def test_profiles_via_config_option(self): RDFParser() except RDFProfileException as e: - eq_(str(e), 'Unknown RDF profiles: profile_conf_1, profile_conf_2') + assert str(e) == 'Unknown RDF profiles: profile_conf_1, profile_conf_2' config.clear() config.update(original_config) @@ -100,14 +99,14 @@ def test_no_profile_provided(self): RDFParser(profiles=[]) except RDFProfileException as e: - eq_(str(e), 'No suitable RDF profiles could be loaded') + assert str(e) == 'No suitable RDF profiles could be loaded' def test_profile_not_found(self): try: RDFParser(profiles=['not_found']) except RDFProfileException as e: - eq_(str(e), 'Unknown RDF profiles: not_found') + assert str(e) == 'Unknown RDF profiles: not_found' def test_profiles_are_called_on_datasets(self): @@ -135,11 +134,11 @@ def test_parse_data(self): p = RDFParser() - eq_(len(p.g), 0) + assert len(p.g) == 0 p.parse(data) - eq_(len(p.g), 2) + assert len(p.g) == 2 def test_parse_pagination_next_page(self): @@ -162,7 +161,7 @@ def test_parse_pagination_next_page(self): p.parse(data) - eq_(p.next_page(), 'http://example.com/catalog.xml?page=2') + assert p.next_page() == 'http://example.com/catalog.xml?page=2' def test_parse_without_pagination(self): @@ -180,7 +179,7 @@ def test_parse_without_pagination(self): p.parse(data) - eq_(p.next_page(), None) + assert p.next_page() == None def test_parse_pagination_last_page(self): @@ -203,7 +202,7 @@ def test_parse_pagination_last_page(self): p.parse(data) - eq_(p.next_page(), None) + assert p.next_page() == None def test_parse_data_different_format(self): @@ -217,11 +216,11 @@ def test_parse_data_different_format(self): p = RDFParser() - eq_(len(p.g), 0) + assert len(p.g) == 0 p.parse(data, _format='n3') - eq_(len(p.g), 2) + assert len(p.g) == 2 def test_parse_data_raises_on_parse_error(self): @@ -229,12 +228,14 @@ def test_parse_data_raises_on_parse_error(self): data = 'Wrong data' - nose.tools.assert_raises(RDFParserException, p.parse, '') + with pytest.raises(RDFParserException): + p.parse('') - nose.tools.assert_raises(RDFParserException, p.parse, data) + with pytest.raises(RDFParserException): + p.parse(data) - nose.tools.assert_raises(RDFParserException, p.parse, data, - _format='n3',) + with pytest.raises(RDFParserException): + p.parse(data, _format='n3') def test__datasets(self): @@ -242,7 +243,7 @@ def test__datasets(self): p.g = _default_graph() - eq_(len([d for d in p._datasets()]), 3) + assert len([d for d in p._datasets()]) == 3 def test__datasets_none_found(self): @@ -250,7 +251,7 @@ def test__datasets_none_found(self): p.g = Graph() - eq_(len([d for d in p._datasets()]), 0) + assert len([d for d in p._datasets()]) == 0 def test_datasets(self): @@ -265,7 
+266,7 @@ def test_datasets(self): datasets.append(dataset) - eq_(len(datasets), 3) + assert len(datasets) == 3 def test_datasets_none_found(self): @@ -273,4 +274,4 @@ def test_datasets_none_found(self): p.g = Graph() - eq_(len([d for d in p.datasets()]), 0) + assert len([d for d in p.datasets()]) == 0 diff --git a/ckanext/dcat/tests/test_base_profile.py b/ckanext/dcat/tests/test_base_profile.py index 916967b7..d47bba0b 100644 --- a/ckanext/dcat/tests/test_base_profile.py +++ b/ckanext/dcat/tests/test_base_profile.py @@ -1,19 +1,17 @@ from builtins import str from builtins import object -import nose + +import pytest +from six import string_types from rdflib import Graph, URIRef, Literal from rdflib.namespace import Namespace -from ckantoolkit.tests import helpers - from ckanext.dcat.profiles import RDFProfile, CleanedURIRef from ckanext.dcat.tests.test_base_parser import _default_graph -eq_ = nose.tools.eq_ - DCT = Namespace("http://purl.org/dc/terms/") TEST = Namespace("http://test.org/") DCAT = Namespace("http://www.w3.org/ns/dcat#") @@ -24,33 +22,33 @@ class TestURIRefPreprocessing(object): def test_with_valid_items(self): testUriPart = "://www.w3.org/ns/dcat#" - + for prefix in ['http', 'https']: - eq_(CleanedURIRef(prefix + testUriPart), URIRef(prefix + testUriPart)) + assert CleanedURIRef(prefix + testUriPart) == URIRef(prefix + testUriPart) # leading and trailing whitespace should be removed - eq_(CleanedURIRef(' ' + prefix + testUriPart + ' '), URIRef(prefix + testUriPart)) + assert CleanedURIRef(' ' + prefix + testUriPart + ' ') == URIRef(prefix + testUriPart) testNonHttpUri = "mailto:someone@example.com" - eq_(CleanedURIRef(testNonHttpUri), URIRef(testNonHttpUri)) + assert CleanedURIRef(testNonHttpUri) == URIRef(testNonHttpUri) # leading and trailing whitespace should be removed again - eq_(CleanedURIRef(' ' + testNonHttpUri + ' '), URIRef(testNonHttpUri)) + assert CleanedURIRef(' ' + testNonHttpUri + ' ') == URIRef(testNonHttpUri) def test_with_invalid_items(self): testUriPart = "://www.w3.org/ns/!dcat #" expectedUriPart = "://www.w3.org/ns/%21dcat%20#" - + for prefix in ['http', 'https']: - eq_(CleanedURIRef(prefix + testUriPart), URIRef(prefix + expectedUriPart)) + assert CleanedURIRef(prefix + testUriPart) == URIRef(prefix + expectedUriPart) # applying on escaped data should have no effect - eq_(CleanedURIRef(prefix + expectedUriPart), URIRef(prefix + expectedUriPart)) + assert CleanedURIRef(prefix + expectedUriPart) == URIRef(prefix + expectedUriPart) # leading and trailing space should not be escaped testNonHttpUri = " mailto:with space!@example.com " expectedNonHttpUri = "mailto:with%20space%21@example.com" - eq_(CleanedURIRef(testNonHttpUri), URIRef(expectedNonHttpUri)) + assert CleanedURIRef(testNonHttpUri) == URIRef(expectedNonHttpUri) # applying on escaped data should have no effect - eq_(CleanedURIRef(expectedNonHttpUri), URIRef(expectedNonHttpUri)) + assert CleanedURIRef(expectedNonHttpUri) == URIRef(expectedNonHttpUri) class TestBaseRDFProfile(object): @@ -59,13 +57,13 @@ def test_datasets(self): p = RDFProfile(_default_graph()) - eq_(len([d for d in p._datasets()]), 3) + assert len([d for d in p._datasets()]) == 3 def test_datasets_none_found(self): p = RDFProfile(Graph()) - eq_(len([d for d in p._datasets()]), 0) + assert len([d for d in p._datasets()]) == 0 def test_distributions(self): @@ -73,11 +71,11 @@ def test_distributions(self): for dataset in p._datasets(): if str(dataset) == 'http://example.org/datasets/1': - eq_(len([d for d in 
p._distributions(dataset)]), 2) + assert len([d for d in p._distributions(dataset)]) == 2 elif str(dataset) == 'http://example.org/datasets/2': - eq_(len([d for d in p._distributions(dataset)]), 1) + assert len([d for d in p._distributions(dataset)]) == 1 elif str(dataset) == 'http://example.org/datasets/3': - eq_(len([d for d in p._distributions(dataset)]), 0) + assert len([d for d in p._distributions(dataset)]) == 0 def test_object(self): @@ -87,7 +85,7 @@ def test_object(self): DCT.title) assert isinstance(_object, Literal) - eq_(str(_object), 'Test Dataset 1') + assert str(_object) == 'Test Dataset 1' def test_object_not_found(self): @@ -96,7 +94,7 @@ def test_object_not_found(self): _object = p._object(URIRef('http://example.org/datasets/1'), DCT.unknown_property) - eq_(_object, None) + assert _object == None def test_object_value(self): @@ -105,8 +103,8 @@ def test_object_value(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, str) - eq_(value, 'Test Dataset 1') + assert isinstance(value, string_types) + assert value == 'Test Dataset 1' def test_object_value_not_found(self): @@ -115,9 +113,9 @@ def test_object_value_not_found(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.unknown_property) - eq_(value, '') + assert value == '' - @helpers.change_config('ckan.locale_default', 'de') + @pytest.mark.ckan_config('ckan.locale_default', 'de') def test_object_value_default_lang(self): p = RDFProfile(_default_graph()) @@ -129,10 +127,10 @@ def test_object_value_default_lang(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, str) - eq_(value, 'Test Datensatz 1') + assert isinstance(value, string_types) + assert value == 'Test Datensatz 1' - @helpers.change_config('ckan.locale_default', 'fr') + @pytest.mark.ckan_config('ckan.locale_default', 'fr') def test_object_value_default_lang_not_in_graph(self): p = RDFProfile(_default_graph()) @@ -142,7 +140,7 @@ def test_object_value_default_lang_not_in_graph(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, str) + assert isinstance(value, string_types) # FR is not in graph, so either node may be used assert value.startswith('Test D') assert value.endswith(' 1') @@ -158,9 +156,9 @@ def test_object_value_default_lang_fallback(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, str) + assert isinstance(value, string_types) # without config parameter, EN is used as default - eq_(value, 'Test Dataset 1 (EN)') + assert value == 'Test Dataset 1 (EN)' def test_object_value_default_lang_missing_lang_param(self): p = RDFProfile(_default_graph()) @@ -168,8 +166,8 @@ def test_object_value_default_lang_missing_lang_param(self): value = p._object_value(URIRef('http://example.org/datasets/1'), DCT.title) - assert isinstance(value, str) - eq_(value, 'Test Dataset 1') + assert isinstance(value, string_types) + assert value == 'Test Dataset 1' def test_object_int(self): @@ -183,7 +181,7 @@ def test_object_int(self): TEST.some_number) assert isinstance(value, int) - eq_(value, 23) + assert value == 23 def test_object_int_decimal(self): @@ -197,7 +195,7 @@ def test_object_int_decimal(self): TEST.some_number) assert isinstance(value, int) - eq_(value, 23) + assert value == 23 def test_object_int_not_found(self): @@ -206,7 +204,7 @@ def test_object_int_not_found(self): value = 
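# Illustrative sketch, not part of this patch: the behaviour that the
# _object_value_int and _object_value_list assertions around here rely on,
# spelled out on a tiny hand-built graph. The URIs and literal values are
# made up for illustration; the helpers come from the RDFProfile base class
# tested above.
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import Namespace

from ckanext.dcat.profiles import RDFProfile

DCAT = Namespace('http://www.w3.org/ns/dcat#')
TEST = Namespace('http://test.org/')

g = Graph()
dataset_ref = URIRef('http://example.org/datasets/1')
g.add((dataset_ref, TEST.some_number, Literal('23.0')))
g.add((dataset_ref, DCAT.keyword, Literal('space')))
g.add((dataset_ref, DCAT.keyword, Literal('moon')))

p = RDFProfile(g)

# Numeric literals (including decimals) come back as ints, missing or
# non-numeric values as None.
assert p._object_value_int(dataset_ref, TEST.some_number) == 23
assert p._object_value_int(dataset_ref, TEST.other_number) is None

# Repeated predicates come back as a plain list of strings, missing ones as [].
assert sorted(p._object_value_list(dataset_ref, DCAT.keyword)) == ['moon', 'space']
assert p._object_value_list(dataset_ref, TEST.some_list) == []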
p._object_value_int(URIRef('http://example.org/datasets/1'), TEST.some_number) - eq_(value, None) + assert value == None def test_object_int_wrong_value(self): @@ -219,7 +217,7 @@ def test_object_int_wrong_value(self): value = p._object_value_int(URIRef('http://example.org/datasets/1'), TEST.some_number) - eq_(value, None) + assert value == None def test_object_list(self): @@ -236,9 +234,9 @@ def test_object_list(self): DCAT.keyword) assert isinstance(value, list) - assert isinstance(value[0], str) - eq_(len(value), 2) - eq_(sorted(value), ['moon', 'space']) + assert isinstance(value[0], string_types) + assert len(value) == 2 + assert sorted(value), ['moon' == 'space'] def test_object_list_not_found(self): @@ -248,7 +246,7 @@ def test_object_list_not_found(self): TEST.some_list) assert isinstance(value, list) - eq_(value, []) + assert value == [] def test_time_interval_schema_org(self): @@ -277,8 +275,8 @@ def test_time_interval_schema_org(self): start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) - eq_(start, '1905-03-01') - eq_(end, '2013-01-05') + assert start == '1905-03-01' + assert end == '2013-01-05' def test_time_interval_w3c_time(self): @@ -315,8 +313,8 @@ def test_time_interval_w3c_time(self): start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) - eq_(start, '1904-01-01') - eq_(end, '2014-03-22') + assert start == '1904-01-01' + assert end == '2014-03-22' def test_publisher_foaf(self): @@ -347,11 +345,11 @@ def test_publisher_foaf(self): publisher = p._publisher(URIRef('http://example.org'), DCT.publisher) - eq_(publisher['uri'], 'http://orgs.vocab.org/some-org') - eq_(publisher['name'], 'Publishing Organization for dataset 1') - eq_(publisher['email'], 'contact@some.org') - eq_(publisher['url'], 'http://some.org') - eq_(publisher['type'], 'http://purl.org/adms/publishertype/NonProfitOrganisation') + assert publisher['uri'] == 'http://orgs.vocab.org/some-org' + assert publisher['name'] == 'Publishing Organization for dataset 1' + assert publisher['email'] == 'contact@some.org' + assert publisher['url'] == 'http://some.org' + assert publisher['type'] == 'http://purl.org/adms/publishertype/NonProfitOrganisation' def test_publisher_ref(self): @@ -374,7 +372,7 @@ def test_publisher_ref(self): publisher = p._publisher(URIRef('http://example.org'), DCT.publisher) - eq_(publisher['uri'], 'http://orgs.vocab.org/some-org') + assert publisher['uri'] == 'http://orgs.vocab.org/some-org' def test_contact_details(self): @@ -404,6 +402,6 @@ def test_contact_details(self): contact = p._contact_details(URIRef('http://example.org'), ADMS.contactPoint) - eq_(contact['name'], 'Point of Contact') + assert contact['name'] == 'Point of Contact' # mailto gets removed for storage and is added again on output - eq_(contact['email'], 'contact@some.org') + assert contact['email'] == 'contact@some.org' diff --git a/ckanext/dcat/tests/test_blueprints.py b/ckanext/dcat/tests/test_blueprints.py new file mode 100644 index 00000000..cc9fc3ac --- /dev/null +++ b/ckanext/dcat/tests/test_blueprints.py @@ -0,0 +1,583 @@ +# -*- coding: utf-8 -*- +from builtins import str +from builtins import range +import time +import nose + +import pytest + +from ckan import plugins as p +from ckan.lib.helpers import url_for + +from rdflib import Graph + +from ckantoolkit.tests import helpers, factories + +from ckanext.dcat.processors import RDFParser +from ckanext.dcat.profiles import RDF, DCAT +from ckanext.dcat.processors import HYDRA + + +@pytest.mark.usefixtures('with_plugins', 
'clean_db', 'clean_index')
+class TestEndpoints():
+
+    def _object_value(self, graph, subject, predicate):
+
+        objects = [o for o in graph.objects(subject, predicate)]
+        return str(objects[0]) if objects else None
+
+    def test_dataset_default(self, app):
+
+        dataset = factories.Dataset(
+            notes='Test dataset'
+        )
+
+        url = url_for('dcat.read_dataset', _id=dataset['name'], _format='rdf')
+
+        response = app.get(url)
+
+        assert response.headers['Content-Type'] == 'application/rdf+xml'
+
+        content = response.body
+
+        # Parse the contents to check it's an actual serialization
+        p = RDFParser()
+
+        p.parse(content, _format='xml')
+
+        dcat_datasets = [d for d in p.datasets()]
+
+        assert len(dcat_datasets) == 1
+
+        dcat_dataset = dcat_datasets[0]
+
+        assert dcat_dataset['title'] == dataset['title']
+        assert dcat_dataset['notes'] == dataset['notes']
+
+    def test_dataset_xml(self, app):
+
+        dataset = factories.Dataset(
+            notes='Test dataset'
+        )
+
+        url = url_for('dcat.read_dataset', _id=dataset['name'], _format='xml')
+
+        response = app.get(url)
+
+        assert response.headers['Content-Type'] == 'application/rdf+xml'
+
+        content = response.body
+
+        # Parse the contents to check it's an actual serialization
+        p = RDFParser()
+
+        p.parse(content, _format='xml')
+
+        dcat_datasets = [d for d in p.datasets()]
+
+        assert len(dcat_datasets) == 1
+
+        dcat_dataset = dcat_datasets[0]
+
+        assert dcat_dataset['title'] == dataset['title']
+        assert dcat_dataset['notes'] == dataset['notes']
+
+    def test_dataset_ttl(self, app):
+
+        dataset = factories.Dataset(
+            notes='Test dataset'
+        )
+
+        url = url_for('dcat.read_dataset', _id=dataset['name'], _format='ttl')
+
+        response = app.get(url)
+
+        assert response.headers['Content-Type'] == 'text/turtle'
+
+        content = response.body
+
+        # Parse the contents to check it's an actual serialization
+        p = RDFParser()
+
+        p.parse(content, _format='turtle')
+
+        dcat_datasets = [d for d in p.datasets()]
+
+        assert len(dcat_datasets) == 1
+
+        dcat_dataset = dcat_datasets[0]
+
+        assert dcat_dataset['title'] == dataset['title']
+        assert dcat_dataset['notes'] == dataset['notes']
+
+    def test_dataset_n3(self, app):
+
+        dataset = factories.Dataset(
+            notes='Test dataset'
+        )
+
+        url = url_for('dcat.read_dataset', _id=dataset['name'], _format='n3')
+
+        response = app.get(url)
+
+        assert response.headers['Content-Type'] == 'text/n3'
+
+        content = response.body
+
+        # Parse the contents to check it's an actual serialization
+        p = RDFParser()
+
+        p.parse(content, _format='n3')
+
+        dcat_datasets = [d for d in p.datasets()]
+
+        assert len(dcat_datasets) == 1
+
+        dcat_dataset = dcat_datasets[0]
+
+        assert dcat_dataset['title'] == dataset['title']
+        assert dcat_dataset['notes'] == dataset['notes']
+
+    def test_dataset_jsonld(self, app):
+
+        dataset = factories.Dataset(
+            notes='Test dataset'
+        )
+
+        url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld')
+
+        response = app.get(url)
+
+        assert response.headers['Content-Type'] == 'application/ld+json'
+
+        content = response.body
+
+        # Parse the contents to check it's an actual serialization
+        p = RDFParser()
+
+        p.parse(content, _format='json-ld')
+
+        dcat_datasets = [d for d in p.datasets()]
+
+        assert len(dcat_datasets) == 1
+
+        dcat_dataset = dcat_datasets[0]
+
+        assert dcat_dataset['title'] == dataset['title']
+        assert dcat_dataset['notes'] == dataset['notes']
+
+    def test_dataset_profiles_jsonld(self, app):
+
+        dataset = factories.Dataset(
+            notes='Test dataset'
+        )
+
+        url = url_for('dcat.read_dataset', _id=dataset['name'],
+                      _format='jsonld', profiles='schemaorg')
+
+        response = app.get(url)
+
+        assert response.headers['Content-Type'] == 'application/ld+json'
+
+        content = response.body
+
+        assert '"@type": "schema:Dataset"' in content
+        assert '"schema:description": "%s"' % dataset['notes'] in content
+
+    def test_dataset_profiles_not_found(self, app):
+
+        dataset = factories.Dataset(
+            notes='Test dataset'
+        )
+
+        url = url_for('dcat.read_dataset', _id=dataset['name'], _format='jsonld', profiles='nope')
+
+        response = app.get(url, status=409)
+
+        assert 'Unknown RDF profiles: nope' in response.body
+
+    def test_dataset_not_found(self, app):
+        import uuid
+
+        url = url_for('dcat.read_dataset', _id=str(uuid.uuid4()), _format='n3')
+
+        app.get(url, status=404)
+
+    def test_dataset_form_is_rendered(self, app):
+
+        sysadmin = factories.Sysadmin()
+        env = {'REMOTE_USER': sysadmin['name'].encode('ascii')}
+        url = url_for('dataset.new')
+
+        response = app.get(url, extra_environ=env)
+
+        content = response.body
+
+        assert '<form id="dataset-edit"' in content
+
+    def test_structured_data_generated(self, app):
+
+        dataset = factories.Dataset(
+            notes='test description'
+        )
+
+        url = url_for('dataset.read', id=dataset['name'])
+
+        response = app.get(url)
+
+        assert '<script type="application/ld+json">' in response.body
+        assert '"schema:description": "test description"' in response.body
+
+    def test_structured_data_not_generated(self, app):
+
+        dataset = factories.Dataset(
+            notes='test description'
+        )
+
+        url = url_for('dataset.read', id=dataset['name'])
+
+        response = app.get(url)
+        assert not '
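# Illustrative sketch, not part of this patch: the structured data tests above
# match raw substrings of the rendered page. A stricter check can pull the
# embedded JSON-LD block out and parse it; the helper below and the field names
# in the commented usage are assumptions for illustration only.
import json
import re


def extract_structured_data(html):
    """Return the parsed JSON-LD payload embedded in a dataset page, or None."""
    match = re.search(
        r'<script type="application/ld\+json">\s*(.*?)\s*</script>',
        html, re.DOTALL)
    return json.loads(match.group(1)) if match else None


# Possible usage inside one of the tests above:
#
#   data = extract_structured_data(response.body)
#   assert data['@type'] == 'schema:Dataset'
#   assert data['schema:description'] == dataset['notes']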