From e5d2222a5ba4d69f9d0b6330d427aa62f1eba11a Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:29:23 +0100 Subject: [PATCH 1/2] Add rating calculation and update results file format --- ckan2mqa/controller/mqa_evaluate.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/ckan2mqa/controller/mqa_evaluate.py b/ckan2mqa/controller/mqa_evaluate.py index 1ab90f9..4ced86a 100644 --- a/ckan2mqa/controller/mqa_evaluate.py +++ b/ckan2mqa/controller/mqa_evaluate.py @@ -9,6 +9,7 @@ # Info +WEIGHT_TOTAL = 405 FINDABILITY = 'Findability' ACCESIBILITY = 'Accessibility' INTEROPERABILITY = 'Interoperability' @@ -263,11 +264,21 @@ def print(self, dimension, property, count, population, weight): else: percentage = 0 if count > 0: - partialPoints = percentage * weight + partialPoints = round(percentage * weight, 2) self.totalPoints += partialPoints else: partialPoints = 0 - self.results_file.write(dimension + "\t" + property + "\t" + str(count) + "\t" + str(population) + "\t" + str(percentage) + "\t" + str(partialPoints) + "\t" + str(weight)+"\n") + self.results_file.write(f"{dimension}\t{property}\t{count}\t{population}\t{round(percentage, 2)}\t{partialPoints}\t{weight}\n") + + def get_rating(self): + if self.totalPoints >= 351: + return "Excellent" + elif self.totalPoints >= 221: + return "Good" + elif self.totalPoints >= 121: + return "Sufficient" + else: + return "Bad" def findability_keywords_available(self): dimension = FINDABILITY @@ -532,7 +543,7 @@ def contextuality_modified_available(self): self.print(dimension, property, count, population, 5) def evaluate(self): - self.results_file.write("Dimension\tIndicator/property\tCount\tPopulation\tPercentage\tPoints\tWeight\n") + self.results_file.write(f"Dimension\tIndicator/property\tCount\tPopulation\tPercentage\tPoints\tWeight\n") self.findability_keywords_available() self.findability_category_available() 
self.findability_spatial_available() @@ -556,6 +567,9 @@ def evaluate(self): self.contextuality_fileSize_available() self.contextuality_issued_available() self.contextuality_modified_available() - self.results_file.write("Total points\t"+ str(round(self.totalPoints, 2))+'\n') + + self.results_file.write(f"Total points\tRating: {self.get_rating()}\t\t\t{round(self.totalPoints/WEIGHT_TOTAL, 2)}\t{round(self.totalPoints, 2)}\t{WEIGHT_TOTAL}\n") self.results_file.close() - logging.info(f"{log_module}:{self.catalog_filename} total points: {str(round(self.totalPoints, 2))}/405") + logging.info(f"{log_module}:{self.catalog_filename} total points: {round(self.totalPoints, 2)}/{WEIGHT_TOTAL}") + + From 709253125db783fea7efdb55d9c40ab730da7d64 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:29:49 +0100 Subject: [PATCH 2/2] Add MQA documentation to README.md --- README.md | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b4211c1..7e01e66 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,79 @@ This Docker Compose configuration integrates the powerful MQA toolset seamlessly with CKAN endpoints and European Data Portal catalogs, enabling users to perform in-depth assessments of metadata quality effortlessly. The setup provides an efficient way to run comprehensive quality checks on various metadata attributes, including data relevance, schema compliance, data format consistency, and adherence to standard vocabularies. -![5 MQA_dimensions png](https://github.com/mjanez/ckan-mqa/assets/96422458/0c54d8c3-e454-4a6a-bcd6-ebc0a0dae080) - - >**Note**
> It can be tested with an open data portal of the CKAN type such as: [mjanez/ckan-docker](https://github.com/mjanez/ckan-docker)[^1] +### [Metadata Quality Assessment Methodology](https://data.europa.eu/mqa/methodology) +The MQA measures the quality of various indicators; each indicator is explained in the tables below. The results of the checks are stored as Data Quality Vocabulary ([DQV](https://www.w3.org/TR/vocab-dqv/)). DQV is a specification of the W3C that is used to describe the quality of a dataset. + + **Dimension** | **Maximal points** +:----------------:|:------------------: + Findability | 100 + Accessibility | 100 + Interoperability | 110 + Reusability | 75 + Contextuality | 20 + *Sum* | 405 + +The dimensions are derived from the FAIR principles: +* **Findability** +The following table describes the metrics that help people and machines in finding datasets. A maximum of 100 points can be scored in this area. + +* **Accessibility** +The following table describes which metrics are used to determine whether access to the data referenced by the distributions is guaranteed. A maximum of 100 points can be scored in this area. + +* **Interoperability** +The following table describes the metrics used to determine whether a distribution is considered interoperable. According to the assumption 'identical content with several distributions', only the distribution with the highest number of points is used to calculate the points. A maximum of 110 points can be scored in this area. + +* **Reusability** +The following table describes which metrics are used to check the reusability of the data. A maximum of 75 points can be scored in this area. + +* **Contextuality** +The following table shows some lightweight properties that provide more context to the user. A maximum of 20 points can be scored in this area. 
+ +![5 MQA_dimensions png](https://github.com/mjanez/ckan-mqa/assets/96422458/0c54d8c3-e454-4a6a-bcd6-ebc0a0dae080) + +The final rating is assigned via four rating groups. The mapping of the points to the rating category is shown in the table below. The representation of the rating in the MQA is expressed exclusively via the rating categories. This enables providers to achieve the highest rating even with a slight deduction of points. + + **Rating** | **Range of points** +:----------:|:-------------------: + Excellent | 351 – 405 + Good | 221 – 350 + Sufficient | 121 – 220 + Bad | 0 – 120 + + +#### Example of ckan-mqa results summary + + **Dimension** | **Indicator/property** | **Count** | **Population** | **Percentage** | **Points** | **Weight** +:----------------:|:-----------------------------------------:|:---------:|:--------------:|:--------------:|:----------:|:----------: + Findability | dcat:keyword | 46 | 46 | 1.0 | 30.0 | 30 + Findability | dcat:theme | 46 | 46 | 1.0 | 30.0 | 30 + Findability | dct:spatial | 42 | 46 | 0.91 | 18.26 | 20 + Findability | dct:temporal | 0 | 46 | 0.0 | 0 | 20 + Accessibility | dcat:accessURL code=200 | 255 | 255 | 1.0 | 50.0 | 50 + Accessibility | dcat:downloadURL | 0 | 255 | 0.0 | 0 | 20 + Accessibility | dcat:downloadURL code=200 | 0 | 255 | 0.0 | 0 | 30 + Interoperability | dct:format | 255 | 255 | 1.0 | 20.0 | 20 + Interoperability | dcat:mediaType | 255 | 255 | 1.0 | 10.0 | 10 + Interoperability | dct:format/dcat:mediaType from vocabulary | 378 | 510 | 0.74 | 7.41 | 10 + Interoperability | dct:format non-proprietary | 131 | 255 | 0.51 | 10.27 | 20 + Interoperability | dct:format machine-readable | 252 | 255 | 0.99 | 19.76 | 20 + Interoperability | DCAT-AP compliance | 0 | 46 | 0.0 | 0 | 30 + Reusability | dct:license | 255 | 255 | 1.0 | 20.0 | 20 + Reusability | dct:license from vocabulary | 245 | 255 | 0.96 | 9.61 | 10 + Reusability | dct:accessRights | 46 | 46 | 1.0 | 10.0 | 10 + Reusability | dct:accessRights from 
vocabulary | 0 | 46 | 0.0 | 0 | 5 + Reusability | dcat:contactPoint | 46 | 46 | 1.0 | 20.0 | 20 + Reusability | dct:publisher | 46 | 46 | 1.0 | 10.0 | 10 + Contextuality | dct:rights | 255 | 255 | 1.0 | 5.0 | 5 + Contextuality | dcat:byteSize | 0 | 255 | 0.0 | 0 | 5 + Contextuality | dct:issued | 46 | 46 | 1.0 | 5.0 | 5 + Contextuality | dct:modified | 46 | 46 | 1.0 | 5.0 | 5 + Total points | Rating: Good | | | 0.69 | 280.31 | 405 + + ## Quick start First copy the `.env.example` template as `.env` and configure by changing the `CKAN_CATALOG_URL`, as well as the DCAT-AP Profile version (`DCATAP_FILES_VERSION`), if needed.