Merge remote-tracking branch 'origin/master' into gh-85
* origin/master:
  0.8.2 release (+ignore __testhome__)
  RF: Replace custom SafeConfigParserWithIncludes with standard ConfigParser
  BF: gh - handle situation when cloned repo is still empty
  DOC: forgotten update to changelog.rst
  0.8.1 release changelog and version boost
  BF: fixup for use of private datalad's functionality for github auth
  minimal Changelog for 0.8.0
  RF: Discontinue use of feature exclusive to the old DataLad Runner
  TST: openfmri: Handle get() exception raised by upcoming datalad
  TST: travis: Upgrade pip before installing datalad
  0.7.0 release changes (changelog etc)
  RF: use .drop instead of protected repo._annex_custom_command
  RF: Discontinue use of GitRepo._git_custom_command()

 Conflicts:
	CHANGELOG.md - included recent entries and removed the leading '#' from them too
yarikoptic committed Apr 13, 2021
2 parents 7b566ad + d8da5e6 commit 04cda0b
Showing 11 changed files with 117 additions and 71 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@ tmp
.*.swp
.travis.yml.evil-bd
.asv
__testhome__
1 change: 1 addition & 0 deletions .travis.yml
@@ -38,6 +38,7 @@ before_install:
# install git-annex with the relevant bits
# no recommends to avoid inheriting the entire multimedia stack
- travis_retry sudo eatmydata apt-get install --no-install-recommends git-annex-standalone aria2 git-remote-gcrypt lsof gnupg nocache p7zip-full
- pip install --upgrade pip

install:
# Install standalone build of git-annex for the recent enough version
20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,23 @@
# 0.8.2 (Mar 19, 2021) -- Hunt the corpses

- RF: Replace custom SafeConfigParserWithIncludes with standard ConfigParser
- BF: gh - handle situation when cloned repo is still empty

# 0.8.1 (Feb 25, 2021) -- Pay the price

- Fix up use of protected datalad's interface for auth to github.
Boosted DataLad version dependency to 0.13.6

# 0.8.0 (Jan 03, 2021) -- Good as New

- Making compatible with recent DataLad by using new WitlessRunner and
not older unused features.

# 0.7.0 (Nov 20, 2020) -- Cherish the moment

- RF: stop using `_{git,annex}_custom_command` to allow DataLad core
progress forward without "breaking" the crawler

# 0.6.0 (Jul 13, 2020) -- Honoring Kyle (who never adds a release "name")

- ENH: fix enabling special remotes when working with recent (as of 202006)
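
The 0.8.0 entry above refers to DataLad's WitlessRunner, which replaced the older Runner/protocol machinery the crawler had relied on. As rough orientation only, a sketch of how such a runner is typically invoked; the class and protocol names are assumed from DataLad ~0.13/0.14 and should be checked against the installed version:

# Rough sketch, not part of this commit: invoking DataLad's WitlessRunner.
# Assumes the datalad.cmd API as of ~0.13/0.14; verify names against your install.
from datalad.cmd import WitlessRunner, StdOutErrCapture

runner = WitlessRunner()
# Run a command and capture stdout/stderr; the capture protocol returns a dict.
result = runner.run(['git', 'version'], protocol=StdOutErrCapture)
print(result['stdout'].strip())
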
11 changes: 0 additions & 11 deletions datalad_crawler/base.py
@@ -11,14 +11,3 @@
"""

__docformat__ = 'restructuredtext'

from datalad import cfg
from datalad.cmd import Runner
from datalad.support.protocol import DryRunProtocol


def get_runner(*args, **kwargs):
if cfg.obtain('datalad.crawl.dryrun', default=False):
kwargs = kwargs.copy()
kwargs['protocol'] = DryRunProtocol()
return Runner(*args, **kwargs)
93 changes: 41 additions & 52 deletions datalad_crawler/nodes/annex.py
@@ -47,7 +47,6 @@

from datalad import cfg

from datalad_crawler.base import get_runner
from datalad_crawler.pipeline import initiate_pipeline_config
from datalad_crawler.dbs.files import PhysicalFileStatusesDB
from datalad_crawler.dbs.files import JsonFileStatusesDB
@@ -59,10 +58,6 @@

lgr = getLogger('datalad.crawl.annex')

_runner = get_runner()
_call = _runner.call
_run = _runner.run


# TODO: make use of datalad_stats
@auto_repr
@@ -229,15 +224,15 @@ def __call__(self, data={}):
elif existing == 'raise':
raise RuntimeError("%s already exists" % dataset_path)
elif existing == 'replace':
_call(rmtree, dataset_path)
rmtree(dataset_path)
elif existing == 'adjust':
# E.g. just regenerate configs/meta
init = False
else: # TODO: 'crawl' ;)
raise ValueError(self.existing)
if init:
_call(self._initiate_dataset, dataset_path, dataset_name)
_call(self._save_crawl_config, dataset_path, data)
self._initiate_dataset(dataset_path, dataset_name)
self._save_crawl_config(dataset_path, data)

yield data_updated

@@ -500,7 +495,7 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
assert fpath
# just add into git directly for now
# TODO: tune add so we could use its json output, and may be even batch it
out_json = _call(self.repo.add, fpath, options=self.options)
out_json = self.repo.add(fpath, options=self.options)
# elif self.mode == 'full':
# # since addurl ignores annex.largefiles we need first to download that file and then
# # annex add it
@@ -536,22 +531,22 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
if isdir(filepath):
# if directory - tricky, since we would want then to check if no
# staged changes under
_call(self._check_no_staged_changes_under_dir, filepath, stats=stats)
_call(rmtree, filepath)
self._check_no_staged_changes_under_dir(filepath, stats=stats)
rmtree(filepath)
else:
_call(unlink, filepath)
_call(stats.increment, 'overwritten')
unlink(filepath)
stats.increment('overwritten')
else:
_call(self._check_non_existing_filepath, filepath, stats=stats)
self._check_non_existing_filepath(filepath, stats=stats)
# TODO: We need to implement our special remote here since no downloaders used
if self.mode == 'full' and url_status and url_status.size: # > 1024**2:
lgr.info("Need to download %s from %s. No progress indication will be reported"
% (naturalsize(url_status.size), url))
try:
out_json = try_multiple(
6, AnnexBatchCommandError, 3, # up to 3**5=243 sec sleep
_call,
self.repo.add_url_to_file, fpath, url,
self.repo.add_url_to_file,
fpath, url,
options=annex_options, batch=self.batch_add)
except AnnexBatchCommandError as exc:
if self.skip_problematic:
@@ -564,22 +559,22 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
if self.mode == 'full' or not added_to_annex:
# we need to adjust our download stats since addurl doesn't do that and we do
# not use our downloaders here
_call(stats.increment, 'downloaded')
_call(stats.increment, 'downloaded_size', _call(lambda: os.stat(filepath).st_size))
stats.increment('downloaded')
stats.increment('downloaded_size', os.stat(filepath).st_size)

# file might have been added but really not changed anything (e.g. the same README was generated)
# TODO:
# if out_json: # if not try -- should be here!
# File might have been not modified at all, so let's check its status first
changed = set().union(*self._get_status(args=[fpath]))
if fpath in changed:
_call(stats.increment,
stats.increment(
'add_annex'
if ('key' in out_json and out_json['key'] is not None)
else 'add_git'
)
)
else:
_call(stats.increment, 'skipped')
stats.increment('skipped')

# TODO!!: sanity check that no large files are added to git directly!

@@ -592,14 +587,14 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
# _call(os.utime, filepath, (time.time(), remote_status.mtime))
# *nix only! TODO
if url_status.mtime:
_call(lmtime, filepath, url_status.mtime)
lmtime(filepath, url_status.mtime)
if statusdb:
_call(statusdb.set, filepath, url_status)
statusdb.set(filepath, url_status)
else:
# we still need to inform DB about this file so later it would signal to remove it
# if we no longer care about it
if statusdb:
_call(statusdb.set, filepath)
statusdb.set(filepath)

self._states.add("Updated git/annex from a remote location")

@@ -625,7 +620,7 @@ def _check_no_staged_changes_under_dir(self, dirpath, stats=None):
dirpath_normalized = _normalize_path(self.repo.path, dirpath)
for dirty_file in dirty_files:
if stats:
_call(stats.increment, 'removed')
stats.increment('removed')
if dirty_file.startswith(dirpath_normalized):
if self.auto_finalize:
self.finalize()({'datalad_stats': stats})
@@ -667,9 +662,9 @@ def _check_non_existing_filepath(self, filepath, stats=None):
"within git. Please commit or remove it before trying "
"to annex this new file" % locals())
lgr.debug("Removing %s as it is in the path of %s" % (dirpath, filepath))
_call(os.unlink, dirpath)
os.unlink(dirpath)
if stats:
_call(stats.increment, 'overwritten')
stats.increment('overwritten')
break # in any case -- we are done!

dirpath, _ = ops(dirpath)
@@ -857,7 +852,7 @@ def merge_branch(data):
elif strategy == 'theirs':
self.repo.merge(to_merge, options=["-s", "ours", "--no-commit"],
expect_stderr=True, **merge_kwargs)
self.repo._git_custom_command([], "git read-tree -m -u %s" % to_merge)
self.repo.call_git(["read-tree", "-m", "-u", to_merge])
self.repo.add('.', options=self.options) # so everything is staged to be committed
else:
raise NotImplementedError(strategy)
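
The 'theirs' branch of merge_branch() above emulates a "take their content" merge by first performing an ours-strategy merge without committing and then overwriting the index and worktree via git read-tree. A plain-git sketch of the same trick, with a hypothetical repository path and branch name:

# Illustrative only: plain-git equivalent of the 'theirs' handling above.
# repo_path and branch are hypothetical placeholders.
import subprocess

def merge_taking_theirs(repo_path, branch):
    # Record the merge parents but keep our tree for now.
    subprocess.run(["git", "merge", "-s", "ours", "--no-commit", branch],
                   cwd=repo_path, check=True)
    # Overwrite index and worktree with the other branch's tree, mirroring
    # the call_git(["read-tree", "-m", "-u", ...]) call in the diff above.
    subprocess.run(["git", "read-tree", "-m", "-u", branch],
                   cwd=repo_path, check=True)
    # Conclude the merge commit (the crawler stages and commits separately).
    subprocess.run(["git", "commit", "-m", "Merge %s, taking their content" % branch],
                   cwd=repo_path, check=True)
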
@@ -905,14 +900,13 @@ def _commit(self, msg=None, options=[]):
if msg is not None:
options = options + ["-m", msg]
self._precommit() # so that all batched annexes stop
self.repo._git_custom_command([], ["git", "commit"] + options,
check_fake_dates=True)
self.repo.call_git(["commit"] + options)
# self.repo.commit(msg)
# self.repo.repo.git.commit(options)

def _unstage(self, fpaths):
# self.repo.cmd_call_wrapper.run(["git", "reset"] + fpaths)
self.repo._git_custom_command(fpaths, ["git", "reset"])
self.repo.call_git(["reset"], files=fpaths)

def _stage(self, fpaths):
self.repo.add(fpaths, git=True)
@@ -925,7 +919,7 @@ def _get_status(self, args=[]):
is resolved
"""
# out, err = self.repo.cmd_call_wrapper.run(["git", "status", "--porcelain"])
cmd_args = ["git", "status", "--porcelain"] + args
cmd_args = ["status", "--porcelain"] + args
staged, notstaged, untracked, deleted = [], [], [], []
statuses = {
'??': untracked,
@@ -939,12 +933,7 @@
# TODO: handle "properly" by committing before D happens
}

if isinstance(self.repo, AnnexRepo) and self.repo.is_direct_mode():
statuses['AD'] = staged
out, err = self.repo.proxy(cmd_args)
else:
out, err = self.repo._git_custom_command([], cmd_args)
assert not err
out = self.repo.call_git(cmd_args)

for l in out.split('\n'):
if not l:
@@ -1066,7 +1055,7 @@ def _commit_versions(data):
# if a single new version -- no special treatment is needed, but we need to
# inform db about this new version
if nnew_versions == 1:
_call(setattr, versions_db, 'version', smallest_new_version)
setattr(versions_db, 'version', smallest_new_version)
# we can't return a generator here
for d in self.finalize()(data):
yield d
@@ -1078,7 +1067,7 @@ def _commit_versions(data):
nfpaths = len(fpaths)
lgr.debug("Unstaging %d files for version %s", nfpaths, version)
nunstaged += nfpaths
_call(self._unstage, list(fpaths.values()))
self._unstage(list(fpaths.values()))

stats = data.get('datalad_stats', None)
stats_str = ('\n\n' + stats.as_str(mode='full')) if stats else ''
@@ -1104,19 +1093,19 @@ def _commit_versions(data):
nunstaged -= nfpaths
assert (nfpaths >= 0)
assert (nunstaged >= 0)
_call(self._stage, vfpaths)
self._stage(vfpaths)

# RF: with .finalize() to avoid code duplication etc
# ??? what to do about stats and states? reset them or somehow tune/figure it out?
vmsg = "Multi-version commit #%d/%d: %s. Remaining unstaged: %d" % (
iversion + 1, nnew_versions, version, nunstaged)

if stats:
_call(stats.reset)
stats.reset()

if version:
_call(setattr, versions_db, 'version', version)
_call(self._commit, "%s (%s)%s" % (', '.join(self._states), vmsg, stats_str), options=[])
setattr(versions_db, 'version', version)
self._commit("%s (%s)%s" % (', '.join(self._states), vmsg, stats_str), options=[])
# unless we update data, no need to yield multiple times I guess
# but shouldn't hurt
yield data
@@ -1336,12 +1325,12 @@ def _finalize(data):
stats = data.get('datalad_stats', None)
if self.repo.dirty: # or self.tracker.dirty # for dry run
lgr.info("Repository found dirty -- adding and committing")
_call(self.repo.add, '.', options=self.options) # so everything is committed
self.repo.add('.', options=self.options) # so everything is committed

stats_str = ('\n\n' + stats.as_str(mode='full')) if stats else ''
_call(self._commit, "%s%s" % (', '.join(self._states), stats_str), options=["-a"])
self._commit("%s%s" % (', '.join(self._states), stats_str), options=["-a"])
if stats:
_call(stats.reset)
stats.reset()
else:
lgr.info("Found branch non-dirty -- nothing was committed")

@@ -1419,9 +1408,9 @@ def __call__(self_, data):
files_str = ": " + ', '.join(obsolete) if len(obsolete) < 10 else ""
lgr.info('Removing %d obsolete files%s' % (len(obsolete), files_str))
stats = data.get('datalad_stats', None)
_call(self.repo.remove, obsolete)
self.repo.remove(obsolete)
if stats:
_call(stats.increment, 'removed', len(obsolete))
stats.increment('removed', len(obsolete))
for filepath in obsolete:
statusdb.remove(filepath)
yield data
@@ -1439,16 +1428,16 @@ def remove(self, data, recursive=False):
filename = self._get_fpath(data, stats)
# TODO: not sure if we should may be check if exists, and skip/just complain if not
if stats:
_call(stats.increment, 'removed')
stats.increment('removed')
filepath = opj(self.repo.path, filename)
if lexists(filepath):
if os.path.isdir(filepath):
if recursive:
_call(self.repo.remove, filename, recursive=True)
self.repo.remove(filename, recursive=True)
else:
lgr.warning("Do not removing %s recursively, skipping", filepath)
else:
_call(self.repo.remove, filename)
self.repo.remove(filename)
else:
lgr.warning("Was asked to remove non-existing path %s", filename)
yield data
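
Across this file the commit replaces GitRepo._git_custom_command() and the indirection through _call()/_run() with direct method calls and GitRepo.call_git(). A minimal sketch of the call_git() forms used above; the repository path, file name, and commit message are placeholders, and only the argument shapes that appear in the diff itself are assumed:

# Sketch only: the GitRepo.call_git() forms this commit switches to.
# "/tmp/example-ds", "README.md", and the message are hypothetical placeholders.
from datalad.support.gitrepo import GitRepo

repo = GitRepo("/tmp/example-ds")                    # an existing git repository
status = repo.call_git(["status", "--porcelain"])    # returns stdout as a string
repo.call_git(["reset"], files=["README.md"])        # pass paths via files=...
repo.call_git(["commit", "-m", "illustrative commit"])
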
6 changes: 3 additions & 3 deletions datalad_crawler/pipeline.py
@@ -62,7 +62,7 @@
from datalad.support.network import parse_url_opts
from datalad.support.stats import ActivityStats
from datalad.support.exceptions import PipelineNotSpecifiedError
from datalad.support.configparserinc import SafeConfigParserWithIncludes
from configparser import ConfigParser

from logging import getLogger
lgr = getLogger('datalad.crawler.pipeline')
@@ -338,7 +338,7 @@ def initiate_pipeline_config(template, template_func=None, template_kwargs=None,

crawl_config_repo_path = opj(CRAWLER_META_DIR, CRAWLER_META_CONFIG_FILENAME)
crawl_config = opj(crawl_config_dir, CRAWLER_META_CONFIG_FILENAME)
cfg_ = SafeConfigParserWithIncludes()
cfg_ = ConfigParser()
cfg_.add_section(CRAWLER_PIPELINE_SECTION)

cfg_.set(CRAWLER_PIPELINE_SECTION, 'template', template)
@@ -497,7 +497,7 @@ def load_pipeline_from_config(path):
so that theoretically we could specify basic pipelines completely within
a URL
"""
cfg_ = SafeConfigParserWithIncludes()
cfg_ = ConfigParser()
cfg_.read([path])
pipeline = None
for sec in (CRAWLER_PIPELINE_SECTION, CRAWLER_PIPELINE_SECTION_DEPRECATED):
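
pipeline.py now relies on the standard library ConfigParser for the crawl configuration. A small round-trip sketch of the write/read pattern used by initiate_pipeline_config() and load_pipeline_from_config() above; the section name and file path below are placeholders rather than the module's actual constants, while the 'template' option mirrors the source:

# Sketch only: ConfigParser round trip analogous to the code above.
# "crawl:pipeline" and "crawl.cfg" are placeholders.
from configparser import ConfigParser

SECTION = "crawl:pipeline"            # stands in for CRAWLER_PIPELINE_SECTION

cfg = ConfigParser()
cfg.add_section(SECTION)
cfg.set(SECTION, "template", "openfmri")
with open("crawl.cfg", "w") as f:
    cfg.write(f)

loaded = ConfigParser()
loaded.read(["crawl.cfg"])            # read() accepts a list of paths
assert loaded.get(SECTION, "template") == "openfmri"
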
(diffs for the remaining 5 changed files are not rendered on this page)
