From 34747aaa056473021c3af37f2b537ea7403b7340 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 3 Aug 2022 19:43:09 -0400 Subject: [PATCH 1/3] RF: remove datalad_deprecated dependency Was introduced in https://github.com/datalad/datalad-crawler/pull/94 with only hypothetical assumptions of the necessity. We are now hypothesizing that it was not needed --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 38f092c..cde751d 100755 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ def findsome(subdir, extensions): requires = { 'core': [ 'datalad>=0.14.0', - 'datalad_deprecated', 'scrapy>=1.1.0', # versioning is primarily for python3 support ], 'devel-docs': [ From dfa39af6eca650835cfd831604ea2e680c070b3e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 4 Aug 2022 09:14:00 -0400 Subject: [PATCH 2/3] XNAT pipeline -- remove unused imports (borrowed from openfmri) and do minor linting --- datalad_crawler/pipelines/xnat.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/datalad_crawler/pipelines/xnat.py b/datalad_crawler/pipelines/xnat.py index 35e9624..f193e0c 100644 --- a/datalad_crawler/pipelines/xnat.py +++ b/datalad_crawler/pipelines/xnat.py @@ -16,32 +16,10 @@ it is """ -import os -import re import json -from os.path import lexists - -# Import necessary nodes -from ..nodes.crawl_url import crawl_url -from ..nodes.matches import css_match, a_href_match from ..nodes.misc import assign -from ..nodes.misc import sub -from ..nodes.misc import switch -from ..nodes.misc import func_to_node -from ..nodes.misc import find_files -from ..nodes.misc import skip_if -from ..nodes.misc import debug -from ..nodes.misc import fix_permissions from ..nodes.annex import Annexificator from datalad.utils import updated -from datalad.consts import ARCHIVES_SPECIAL_REMOTE, DATALAD_SPECIAL_REMOTE -from datalad.downloaders.providers import Providers - -# For S3 crawling -from ..nodes.s3 import 
crawl_s3 -from .openfmri_s3 import pipeline as s3_pipeline -from datalad.api import ls -from datalad.dochelpers import exc_str # Possibly instantiate a logger if you would like to log # during pipeline creation @@ -51,9 +29,11 @@ from datalad.tests.utils_pytest import eq_ from datalad.utils import assure_list, assure_bool + def list_to_dict(l, field): return {r.pop(field): r for r in l} + DEFAULT_RESULT_FIELDS = {'totalrecords', 'result'} PROJECT_ACCESS_TYPES = {'public', 'protected', 'private'} @@ -144,7 +124,7 @@ def __call__(self, query, def get_projects(self, limit=None, drop_empty=True, asdict=True): """Get list of projects - + Parameters ---------- limit: {'public', 'protected', 'private', None} or list of thereof From d444b1931e5e01b3b0b459dc303f60201397d54e Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 4 Aug 2022 09:29:58 -0400 Subject: [PATCH 3/3] RF: remove ls dependency by checking openfmri bucket directly All of that is not really needed any more since openneuro no longer needs to be crawled - it uses git-annex natively --- datalad_crawler/pipelines/openfmri.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datalad_crawler/pipelines/openfmri.py b/datalad_crawler/pipelines/openfmri.py index 48daf34..af2ce24 100644 --- a/datalad_crawler/pipelines/openfmri.py +++ b/datalad_crawler/pipelines/openfmri.py @@ -11,6 +11,8 @@ import os from os.path import lexists +from datalad.downloaders.s3 import S3Authenticator + # Import necessary nodes from ..nodes.crawl_url import crawl_url from ..nodes.matches import a_href_match @@ -26,7 +28,6 @@ # For S3 crawling from .openfmri_s3 import pipeline as s3_pipeline -from datalad.api import ls from datalad.dochelpers import exc_str # Possibly instantiate a logger if you would like to log @@ -123,9 +124,9 @@ def pipeline(dataset, # assert suf in 'AB' # s3_prefix = 'ds017' + suf - openfmri_s3_prefix = 's3://openneuro/' try: - if not ls('%s%s' % (openfmri_s3_prefix, s3_prefix)): + 
bucket = S3Authenticator().authenticate("openneuro", None) + if not next(iter(bucket.list(s3_prefix, "/"))): s3_prefix = None # not there except Exception as exc: lgr.warning(