Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Specs compliance #45

Merged
merged 7 commits into from
Jul 13, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,5 @@ RUN true \
&& true

USER dp
RUN find examples -name '*.json' -print0 | xargs -n 1 -0 python scripts/validate_dpkg.py --log-level=DEBUG
CMD ["make", "test"]
11 changes: 11 additions & 0 deletions biotracks/createdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,17 @@
import csv
import io
import os
import re

import datapackage as dp
from jsontableschema import infer
from .names import OBJECTS_TABLE_NAME, LINKS_TABLE_NAME


# Data package names must be lower-case and may only contain alphanumeric
# characters plus ".", "_" and "-":
# https://specs.frictionlessdata.io/data-package/#metadata
# The pattern is closed with \Z rather than $ because $ also matches just
# before a trailing newline, which would let a name such as "foo\n" through.
NAME_PATTERN = re.compile(r"^[a-z0-9_.-]+\Z")


def create_dpkg(top_level_dict, dict_, directory, joint_id):
"""Create the datapackage representation.

Expand All @@ -42,6 +47,12 @@ def create_dpkg(top_level_dict, dict_, directory, joint_id):
directory -- the directory
joint_id -- the joint_identifier
"""
try:
name = top_level_dict["name"]
except KeyError:
raise ValueError("'name' is a required property")
if not NAME_PATTERN.match(name):
raise ValueError("invalid name: %r" % (name,))

myDP = dp.DataPackage()

Expand Down
168 changes: 168 additions & 0 deletions biotracks/validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# #%L
# Copyright (c) 2016-2017 Cell Migration Standardisation Organization
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# #L%

import datapackage
import datapackage.registry
import datapackage.exceptions

from . import names
from .utils import get_logger


# Maps each known table (resource) name to the set of field names that the
# table's schema must declare. Validator looks entries up by the resource
# descriptor's 'name' and checks that the set is a subset of the schema fields.
REQUIRED_FIELDS = {
names.OBJECTS_TABLE_NAME: {names.OBJECT_NAME, names.FRAME_NAME,
names.X_COORD_NAME, names.Y_COORD_NAME},
names.LINKS_TABLE_NAME: {names.LINK_NAME, names.OBJECT_NAME},
names.TRACKS_TABLE_NAME: {names.TRACK_NAME, names.LINK_NAME},
}

# The single foreign key a links table is expected to carry: its object id
# field referencing the object id field of the objects table.
# NOTE(review): not referenced by the code visible in this module;
# Validator.validate_foreign_keys checks the same values field by field.
FOREIGN_KEYS = [
{"fields": names.OBJECT_NAME,
"reference": {"fields": names.OBJECT_NAME,
"resource": names.OBJECTS_TABLE_NAME}}
]


def is_tabular(dp):
    """Tell whether dp is a DataPackage built on the 'tabular' schema.

    Anything that is not a datapackage.DataPackage instance yields False.
    Otherwise the package's schema, as a dict, is compared for equality
    with the 'tabular' entry fetched from a fresh registry.
    """
    if isinstance(dp, datapackage.DataPackage):
        registry = datapackage.registry.Registry()
        return dp.schema.to_dict() == registry.get('tabular')
    return False


class ValidationError(datapackage.exceptions.ValidationError):
    """Validation failure reported by this module.

    Subclasses datapackage's own ValidationError, so handlers written for
    the base exception also catch this one.
    """


class Validator(object):
    """Check that a tabular data package has the expected tracking layout.

    On top of the generic tabular data package validation, the package must
    contain an objects table and a links table (a tracks table is optional),
    each declaring the fields listed in REQUIRED_FIELDS. The objects table
    must use the object id as a unique primary key, and the links table
    must reference it through exactly one foreign key.

    Every failed check is logged at error level and raised as
    ValidationError.
    """

    def __init__(self, log_level=None):
        """Set up a logger named after the class.

        log_level -- level handed to get_logger (None is passed through)
        """
        self.logger = get_logger(self.__class__.__name__, level=log_level)

    def validate(self, dp):
        """Validate a data package.

        dp -- either a datapackage.DataPackage instance, or any descriptor
              accepted by the datapackage.DataPackage constructor

        Raises ValueError if dp is a DataPackage instance that is not
        tabular, and ValidationError if any check fails.
        """
        if isinstance(dp, datapackage.DataPackage):
            # An existing package is used as it is: only descriptors are
            # handed to the DataPackage constructor.
            if not is_tabular(dp):
                raise ValueError("data package must be a tabular data package")
        else:
            dp = datapackage.DataPackage(dp, schema="tabular")
        dp.validate()
        self.logger.debug("valid tabular data package")
        if len(dp.resources) < 2:
            self.__error("data package must have at least two resources")
        res_map = {res.descriptor['name']: res for res in dp.resources}
        try:
            objects = res_map[names.OBJECTS_TABLE_NAME]
        except KeyError:
            self.__error("objects table not found")
        else:
            self.validate_objects(objects.descriptor)
        try:
            links = res_map[names.LINKS_TABLE_NAME]
        except KeyError:
            self.__error("links table not found")
        else:
            self.validate_links(links.descriptor)
        # The tracks table is optional: it is checked only when present.
        try:
            tracks = res_map[names.TRACKS_TABLE_NAME]
        except KeyError:
            pass
        else:
            self.validate_tracks(tracks.descriptor)

    def validate_objects(self, objects):
        """Check the objects table descriptor.

        The schema must name the object id as primary key, declare all
        required fields, and mark the object id field as unique.
        """
        try:
            pk = objects["schema"]["primaryKey"]
        except KeyError:
            self.__error("objects table schema has no primary key")
        if pk != names.OBJECT_NAME:
            self.__error(
                "objects table primary key must be %r" % (names.OBJECT_NAME,)
            )
        by_name = self.__check_required_fields(objects)
        id_field = by_name[names.OBJECT_NAME]
        try:
            constraints = id_field["constraints"]
        except KeyError:
            self.__error("object id field has no constraints")
        try:
            unique = constraints["unique"]
        except KeyError:
            self.__error("object id constraints: missing 'unique' property")
        if not unique:
            self.__error("object id constraints: 'unique' property is false")

    def validate_links(self, links):
        """Check the links table descriptor (foreign keys, required fields)."""
        try:
            fk = links["schema"]["foreignKeys"]
        except KeyError:
            # The descriptor under inspection is the links table one.
            self.__error("links table schema has no foreign keys")
        self.validate_foreign_keys(fk)
        self.__check_required_fields(links)

    def validate_tracks(self, tracks):
        """Check that the tracks table declares all required fields."""
        self.__check_required_fields(tracks)

    def validate_foreign_keys(self, fk):
        """Check the foreignKeys list of the links table.

        Exactly one entry is allowed; it must map the object id field to
        the object id field of the objects table.
        """
        if len(fk) != 1:
            self.__error("links table must have exactly one foreign key")
        fk = fk[0]
        try:
            fields = fk["fields"]
            ref = fk["reference"]
        except KeyError as e:
            self.__error("missing property in foreignKeys: %r" % e.args)
        if fields != names.OBJECT_NAME:
            self.__error(
                "foreignKeys fields must be %r" % (names.OBJECT_NAME,)
            )
        try:
            ref_fields = ref["fields"]
            ref_res = ref["resource"]
        except KeyError as e:
            self.__error(
                "missing property in foreignKeys reference: %r" % e.args
            )
        if ref_fields != names.OBJECT_NAME:
            self.__error(
                "foreignKeys ref fields must be %r" % (names.OBJECT_NAME,)
            )
        if ref_res != names.OBJECTS_TABLE_NAME:
            self.__error(
                "foreignKeys ref resource must be %r" % (
                    names.OBJECTS_TABLE_NAME,)
            )

    def __check_required_fields(self, descriptor):
        """Check that a table declares every field required for its name.

        Returns the schema's field descriptors as a dict keyed by field
        name.
        """
        required = REQUIRED_FIELDS[descriptor['name']]
        by_name = {
            field['name']: field for field in descriptor['schema']['fields']
        }
        if not required <= set(by_name):
            self.__error(
                "required fields for %s: %r" % (descriptor['name'], required)
            )
        return by_name

    def __error(self, msg):
        """Log msg at error level, then raise it as ValidationError."""
        self.logger.error(msg)
        raise ValidationError(msg)
63 changes: 63 additions & 0 deletions scripts/validate_dpkg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# #%L
# Copyright (c) 2016-2017 Cell Migration Standardisation Organization
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# #L%

"""\
Validate a CMSO datapackage.
"""

import sys
import argparse

from biotracks.validation import Validator
from biotracks.utils import get_log_level, get_logger


def log_level(s):
    """Convert a command line string to a log level (argparse ``type``).

    s -- the string given on the command line

    Returns whatever get_log_level returns for s. A ValueError from
    get_log_level is re-raised as argparse.ArgumentTypeError so that
    argparse reports it as a usage error.
    """
    try:
        return get_log_level(s)
    except ValueError as e:
        # Exceptions have no 'message' attribute on Python 3; str(e) gives
        # the message text on both Python 2 and 3.
        raise argparse.ArgumentTypeError(str(e))


def make_parser():
    """Build the command line parser for this script.

    The parser takes one positional argument, the data package JSON file,
    and an optional --log-level converted through log_level.
    """
    level_options = dict(
        metavar='LEVEL',
        type=log_level,
        default='INFO',
        help='logging level',
    )
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument('dp_fn', metavar="JSON_FILE")
    arg_parser.add_argument('--log-level', **level_options)
    return arg_parser


def main(argv):
    """Validate the data package named on the command line.

    argv -- the full argument vector, program name included (the first
            item is dropped before parsing)

    Exceptions raised by the validator are not caught here.
    """
    options = make_parser().parse_args(argv[1:])
    level = options.log_level
    descriptor_path = options.dp_fn
    logger = get_logger('validate_dpkg', level=level, f=sys.stdout)
    Validator(log_level=level).validate(descriptor_path)
    logger.debug("%r: OK" % (descriptor_path))


# Script entry point: nothing runs when the module is merely imported.
if __name__ == "__main__":
    main(sys.argv)
3 changes: 3 additions & 0 deletions tests/test_createdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,6 @@ def __check_dps(self, d):
tld = d['conf']['TOP_LEVEL_INFO']
dp = createdp.create_dpkg(tld, {}, d['dp_dir'], names.OBJECT_NAME)
assert dp.to_dict() == d['dp'].to_dict()
tld['name'] = "CMSO_TRACKS"
with pytest.raises(ValueError):
createdp.create_dpkg(tld, {}, d['dp_dir'], names.OBJECT_NAME)
Loading