diff --git a/datahugger/base.py b/datahugger/base.py
index cd7797b..66c76c7 100644
--- a/datahugger/base.py
+++ b/datahugger/base.py
@@ -11,7 +11,7 @@
 from urllib.parse import urlparse
 
 import requests
-from jsonpath_ng import parse
+from jsonpath_ng.ext import parse
 from scitree import scitree
 from tqdm import tqdm
 
diff --git a/datahugger/config.py b/datahugger/config.py
index 2c5e90f..45899dc 100644
--- a/datahugger/config.py
+++ b/datahugger/config.py
@@ -1,5 +1,6 @@
 from datahugger.services import ArXivDataset
 from datahugger.services import DataDryadDataset
+from datahugger.services import DataEuropaDataset
 from datahugger.services import DataOneDataset
 from datahugger.services import DataverseDataset
 from datahugger.services import DjehutyDataset
@@ -117,6 +118,7 @@
     "trolling.uit.no": DataverseDataset,
     "www.sodha.be": DataverseDataset,
     "www.uni-hildesheim.de": DataverseDataset,
+    "data.europa.eu": DataEuropaDataset,
 }
 
 # regexp lookup
diff --git a/datahugger/services.py b/datahugger/services.py
index 3a0ce60..60fc9f2 100644
--- a/datahugger/services.py
+++ b/datahugger/services.py
@@ -390,6 +390,23 @@ def _get_attr_hash_type(self, record):
         return self._get_attr_attr(record, self.ATTR_HASH_JSONPATH).split(":")[0]
 
 
+class DataEuropaDataset(DatasetDownloader):
+    """Downloader for European data repository."""
+
+    REGEXP_ID = r"data\.europa\.eu\/data\/datasets\/(?P<record_id>.+)"
+
+    # the base entry point of the REST API
+    API_URL = "https://data.europa.eu/api/hub/repo/"
+
+    API_URL_META = "{api_url}datasets/{record_id}"
+    META_FILES_JSONPATH = '$.@graph[?(@.@type == "dcat:Distribution")]'
+
+    # paths to file attributes
+    ATTR_FILE_LINK_JSONPATH = "'dcat:accessURL'.@id"
+    ATTR_NAME_JSONPATH = "'dct:title'"
+    ATTR_SIZE_JSONPATH = "'dcat:byteSize'.@value"
+
+
 class SeaNoeDataset(DatasetDownloader):
     """Downloader for SeaNoe publication."""
 
diff --git a/tests/test_repositories.toml b/tests/test_repositories.toml
index ce22f08..a19e000 100644
--- a/tests/test_repositories.toml
+++ b/tests/test_repositories.toml
@@ -110,6 +110,10 @@ files = "AA_age.tab"
 location = "https://github.com/j535d165/cbsodata"
 files = "cbsodata-main/README.md"
 
+[[dataeuropa]]
+location = "https://data.europa.eu/data/datasets/65e092e4009f18f050b14216"
+files = "consolidation-wattzhub-schema-irve-dynamic-20240918-033000.csv"
+
 [[seanoe]]
 location = "https://doi.org/10.17882/101042"
 files = "111609.xlsx"