Skip to content

Commit

Permalink
feat: add zenodo download module
Browse files Browse the repository at this point in the history
  • Loading branch information
makkus committed Mar 7, 2024
1 parent cb9e473 commit f14db56
Show file tree
Hide file tree
Showing 5 changed files with 210 additions and 13 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ plugins = [
# mypy per-module options:
[[tool.mypy.overrides]]
module = [
"pyzenodo3.*"
"patoolib.*",
"pyzenodo3.*",
"ruamel.*",
]
ignore_missing_imports = true
29 changes: 23 additions & 6 deletions src/kiara_plugin/onboarding/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def create_outputs_schema(

def process(self, inputs: ValueMap, outputs: ValueMap):

from kiara.models.filesystem import FolderImportConfig
from kiara.models.filesystem import FolderImportConfig, KiaraFileBundle

bundle_name = self.get_config_value("result_bundle_name")
if bundle_name is None:
Expand Down Expand Up @@ -280,20 +280,36 @@ def process(self, inputs: ValueMap, outputs: ValueMap):
if attach_metadata_to_files is None:
attach_metadata_to_files = inputs.get_value_data("attach_metadata_to_files")

archive = self.retrieve_archive(inputs=inputs)
result = self.extract_archive(
archive_file=archive,
archive = self.retrieve_archive(
inputs=inputs,
bundle_name=bundle_name,
attach_metadata_to_bundle=attach_metadata_to_bundle,
attach_metadata_to_files=attach_metadata_to_files,
import_config=import_config,
)
if isinstance(archive, KiaraFileBundle):
result = archive
else:
result = self.extract_archive(
archive_file=archive,
bundle_name=bundle_name,
attach_metadata_to_bundle=attach_metadata_to_bundle,
attach_metadata_to_files=attach_metadata_to_files,
import_config=import_config,
)

outputs.set_value("file_bundle", result)

@abc.abstractmethod
def retrieve_archive(self, inputs: ValueMap) -> "KiaraFile":
pass
def retrieve_archive(
self,
inputs: ValueMap,
bundle_name: Union[str, None],
attach_metadata_to_bundle: bool,
attach_metadata_to_files: bool,
import_config: "FolderImportConfig",
) -> Union["KiaraFile", "KiaraFileBundle"]:
"""Retrieve an archive file, or the actual result file bundle."""

def extract_archive(
self,
Expand All @@ -303,6 +319,7 @@ def extract_archive(
attach_metadata_to_files: bool,
import_config: "FolderImportConfig",
) -> "KiaraFileBundle":
"""Extract the archive file that was returned in 'retrieve_archive'."""

from kiara.models.filesystem import KiaraFileBundle

Expand Down
13 changes: 10 additions & 3 deletions src/kiara_plugin/onboarding/modules/download/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from kiara_plugin.onboarding.modules import OnboardFileBundleModule, OnboardFileModule

if TYPE_CHECKING:
from kiara.models.filesystem import KiaraFile
from kiara.models.filesystem import FolderImportConfig, KiaraFile, KiaraFileBundle


class DownloadFileModule(OnboardFileModule):
Expand Down Expand Up @@ -65,7 +65,14 @@ def create_onboard_inputs_schema(self) -> Dict[str, Any]:

return result

def retrieve_archive(self, inputs: ValueMap) -> "KiaraFile":
def retrieve_archive(
self,
inputs: ValueMap,
bundle_name: Union[str, None],
attach_metadata_to_bundle: bool,
attach_metadata_to_files: bool,
import_config: "FolderImportConfig",
) -> Union["KiaraFile", "KiaraFileBundle"]:

from urllib.parse import urlparse

Expand All @@ -87,7 +94,7 @@ def retrieve_archive(self, inputs: ValueMap) -> "KiaraFile":

kiara_file: KiaraFile

kiara_file = download_file(
kiara_file = download_file( # type: ignore
url, target=tmp_file.name, attach_metadata=True, return_md5_hash=False
)

Expand Down
13 changes: 10 additions & 3 deletions src/kiara_plugin/onboarding/modules/download/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from kiara_plugin.onboarding.modules import OnboardFileBundleModule, OnboardFileModule

if TYPE_CHECKING:
from kiara.models.filesystem import KiaraFile
from kiara.models.filesystem import FolderImportConfig, KiaraFile, KiaraFileBundle


class DownloadGithubFileModule(OnboardFileModule):
Expand Down Expand Up @@ -49,7 +49,7 @@ def retrieve_file(
return result_file


class DownloadFileBundleModule(OnboardFileBundleModule):
class DownloadGithbFileBundleModule(OnboardFileBundleModule):
"""Download a file bundle from a remote github repository.
If 'sub_path' is not specified, the whole repo will be used.
Expand All @@ -70,7 +70,14 @@ def create_onboard_inputs_schema(self) -> Dict[str, Any]:
}
return result

def retrieve_archive(self, inputs: ValueMap) -> "KiaraFile":
def retrieve_archive(
self,
inputs: ValueMap,
bundle_name: Union[str, None],
attach_metadata_to_bundle: bool,
attach_metadata_to_files: bool,
import_config: "FolderImportConfig",
) -> Union["KiaraFile", "KiaraFileBundle"]:

from kiara_plugin.onboarding.utils.download import download_file

Expand Down
164 changes: 164 additions & 0 deletions src/kiara_plugin/onboarding/modules/download/zenodo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
from typing import TYPE_CHECKING, Any, Dict, Union

from kiara.exceptions import KiaraException
from kiara.models.values.value import ValueMap
from kiara_plugin.onboarding.modules import OnboardFileBundleModule, OnboardFileModule

if TYPE_CHECKING:
from kiara.models.filesystem import FolderImportConfig, KiaraFile, KiaraFileBundle


class DownloadZenodoFileModule(OnboardFileModule):
    """Download a single file from a Zenodo record."""

    _module_type_name = "download.file.from.zenodo"

    def create_onboard_inputs_schema(self) -> Dict[str, Any]:
        """Return the schema for the module-specific inputs 'doi' and 'path'."""

        result: Dict[str, Dict[str, Any]] = {
            "doi": {"type": "string", "doc": "The DOI."},
            "path": {
                "type": "string",
                "doc": "The path to the file/file name within the dataset.",
                "optional": True,
            },
        }
        return result

    def retrieve_file(
        self, inputs: ValueMap, file_name: Union[str, None], attach_metadata: bool
    ) -> Any:
        """Download the file selected by the 'path' input from a Zenodo record.

        The 'doi' input is used as-is if it contains '/zenodo.'; otherwise it
        is treated as a bare record id and expanded to '10.5281/zenodo.<id>'.

        Args:
            inputs: the module inputs, must provide 'doi' and 'path'.
            file_name: currently ignored, the name is derived from 'path'
                (see the review note in the body).
            attach_metadata: forwarded to 'download_file'; if set, the Zenodo
                record data is also stored in the metadata of the result
                under 'zenodo_record_data'.

        Returns:
            The file object returned by 'download_file'.

        Raises:
            KiaraException: if the record contains no file whose key equals
                'path', or if the md5 hash of the download does not match the
                checksum listed in the record.
        """

        import pyzenodo3

        from kiara_plugin.onboarding.utils.download import download_file

        doi = inputs.get_value_data("doi")
        file_path = inputs.get_value_data("path")

        if "/zenodo." not in doi:
            doi = f"10.5281/zenodo.{doi}"

        zen = pyzenodo3.Zenodo()
        record = zen.find_record_by_doi(doi)

        available_files = record.data["files"]
        match = next(
            (_file for _file in available_files if _file["key"] == file_path), None
        )

        if match is None:
            # List what the record offers, so the user can correct the path.
            listing = "".join(f" - {_file['key']}\n" for _file in available_files)
            msg = f"Available files:\n{listing}"
            raise KiaraException(
                msg=f"Can't find file '{file_path}' in Zenodo record. {msg}"
            )

        url = match["links"]["self"]
        # Drop the first four characters of the record checksum; presumably
        # an 'md5:' prefix -- TODO confirm against the Zenodo API.
        checksum = match["checksum"][4:]

        # NOTE(review): this overrides whatever the caller passed in as
        # 'file_name' -- confirm that this is intended.
        file_name = file_path.split("/")[-1]

        result_file: KiaraFile
        result_file, result_checksum = download_file(  # type: ignore
            url=url,
            file_name=file_name,
            attach_metadata=attach_metadata,
            return_md5_hash=True,
        )

        if checksum != result_checksum:
            # Report both hashes: the expected one and the one computed for
            # the downloaded data.
            raise KiaraException(
                msg=f"Can't download file '{file_name}' from zenodo, invalid checksum: {checksum} != {result_checksum}"
            )

        if attach_metadata:
            result_file.metadata["zenodo_record_data"] = record.data

        return result_file


class DownloadZenodoFileBundleModule(OnboardFileBundleModule):
    """Download all files of a Zenodo record as a file bundle."""

    _module_type_name = "download.file_bundle.from.zenodo"

    def create_onboard_inputs_schema(self) -> Dict[str, Any]:
        """Return the schema for the module-specific input 'doi'."""

        result: Dict[str, Dict[str, Any]] = {
            "doi": {"type": "string", "doc": "The DOI."},
        }
        return result

    def retrieve_archive(
        self,
        inputs: ValueMap,
        bundle_name: Union[str, None],
        attach_metadata_to_bundle: bool,
        attach_metadata_to_files: bool,
        import_config: "FolderImportConfig",
    ) -> Union["KiaraFile", "KiaraFileBundle"]:
        """Download every file of a Zenodo record and return them as a bundle.

        This returns the finished file bundle, not an archive file. The 'doi'
        input is used as-is if it contains '/zenodo.'; otherwise it is treated
        as a bare record id and expanded to '10.5281/zenodo.<id>'.

        Args:
            inputs: the module inputs, must provide 'doi'.
            bundle_name: name of the resulting bundle, the DOI is used if this
                is empty or None.
            attach_metadata_to_bundle: if set, store the Zenodo record data in
                the bundle metadata under 'zenodo_record_data'.
            attach_metadata_to_files: if set, store the Zenodo record data in
                the metadata of every included file under the same key.
            import_config: passed on to 'KiaraFileBundle.import_folder'.

        Returns:
            The bundle imported from the temporary download folder.

        Raises:
            KiaraException: if the md5 hash of a downloaded file does not
                match the checksum listed in the record.
        """

        import pyzenodo3

        from kiara.models.filesystem import KiaraFile, KiaraFileBundle
        from kiara_plugin.onboarding.utils.download import download_file

        doi = inputs.get_value_data("doi")

        if "/zenodo." not in doi:
            doi = f"10.5281/zenodo.{doi}"

        zen = pyzenodo3.Zenodo()
        record = zen.find_record_by_doi(doi)

        base_path = KiaraFileBundle.create_tmp_dir()

        for _available_file in record.data["files"]:
            url = _available_file["links"]["self"]
            # Drop the first four characters of the record checksum; presumably
            # an 'md5:' prefix -- TODO confirm against the Zenodo API.
            checksum = _available_file["checksum"][4:]

            file_path = _available_file["key"]
            full_path = base_path / file_path
            # A key may contain '/', make sure the target folder exists before
            # the download writes to it.
            full_path.parent.mkdir(parents=True, exist_ok=True)

            file_name = file_path.split("/")[-1]

            # TODO: filter here already, so we don't need to download files we don't want

            result_file: KiaraFile
            result_file, result_checksum = download_file(  # type: ignore
                url=url,
                target=full_path.as_posix(),
                file_name=file_name,
                attach_metadata=True,
                return_md5_hash=True,
            )

            if checksum != result_checksum:
                raise KiaraException(
                    msg=f"Can't download file '{file_name}' from zenodo, invalid checksum: {checksum} != {result_checksum}"
                )

        if not bundle_name:
            bundle_name = doi
        result = KiaraFileBundle.import_folder(
            source=base_path.as_posix(),
            bundle_name=bundle_name,
            import_config=import_config,
        )
        if attach_metadata_to_bundle:
            result.metadata["zenodo_record_data"] = record.data

        if attach_metadata_to_files:
            for file in result.included_files.values():
                file.metadata["zenodo_record_data"] = record.data

        return result

0 comments on commit f14db56

Please sign in to comment.