Merge remote-tracking branch 'origin/master' into gh-85
* origin/master:
  0.8.2 release (+ignore __testhome__)
  RF: Replace custom SafeConfigParserWithIncludes with standard ConfigParser
  BF: gh - handle situation when cloned repo is still empty
  DOC: forgotten update to changelog.rst
  0.8.1 release changelog and version boost
  BF: fixup for use of private datalad's functionality for github auth
  minimal Changelog for 0.8.0
  RF: Discontinue use of feature exclusive to the old DataLad Runner
  TST: openfmri: Handle get() exception raised by upcoming datalad
  TST: travis: Upgrade pip before installing datalad
  0.7.0 release changes (changelog etc)
  RF: use .drop instead of protected repo._annex_custom_command
  RF: Discontinue use of GitRepo._git_custom_command()

 Conflicts:
	CHANGELOG.md - included recent entries and removed the leading '#' from them too
yarikoptic committed Apr 13, 2021
2 parents 7b566ad + d8da5e6 commit 04cda0b
Showing 11 changed files with 117 additions and 71 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@ tmp
.*.swp
.travis.yml.evil-bd
.asv
__testhome__
1 change: 1 addition & 0 deletions .travis.yml
@@ -38,6 +38,7 @@ before_install:
# install git-annex with the relevant bits
# no recommends to avoid inheriting the entire multimedia stack
- travis_retry sudo eatmydata apt-get install --no-install-recommends git-annex-standalone aria2 git-remote-gcrypt lsof gnupg nocache p7zip-full
- pip install --upgrade pip

install:
# Install standalone build of git-annex for the recent enough version
20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,23 @@
# 0.8.2 (Mar 19, 2021) -- Hunt the corpses

- RF: Replace custom SafeConfigParserWithIncludes with standard ConfigParser
- BF: gh - handle situation when cloned repo is still empty

# 0.8.1 (Feb 25, 2021) -- Pay the price

- Fix up use of protected datalad's interface for auth to github.
Boosted DataLad version dependency to 0.13.6

# 0.8.0 (Jan 03, 2021) -- Good as New

- Making compatible with recent DataLad by using new WitlessRunner and
not older unused features.

# 0.7.0 (Nov 20, 2020) -- Cherish the moment

- RF: stop using `_{git,annex}_custom_command` to allow DataLad core
progress forward without "breaking" the crawler

# 0.6.0 (Jul 13, 2020) -- Honoring Kyle (who never adds a release "name")

- ENH: fix enabling special remotes when working with recent (as of 202006)
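
The 0.8.0 entry above refers to DataLad's WitlessRunner, which replaced the older Runner/protocol machinery the crawler had relied on. As rough orientation only, a sketch of how such a runner is typically invoked; the class and protocol names are assumed from DataLad ~0.13/0.14 and should be checked against the installed version:

# Rough sketch, not part of this commit: invoking DataLad's WitlessRunner.
# Assumes the datalad.cmd API as of ~0.13/0.14; verify names against your install.
from datalad.cmd import WitlessRunner, StdOutErrCapture

runner = WitlessRunner()
# Run a command and capture stdout/stderr; the capture protocol returns a dict.
result = runner.run(['git', 'version'], protocol=StdOutErrCapture)
print(result['stdout'].strip())
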
11 changes: 0 additions & 11 deletions datalad_crawler/base.py
@@ -11,14 +11,3 @@
"""

__docformat__ = 'restructuredtext'

from datalad import cfg
from datalad.cmd import Runner
from datalad.support.protocol import DryRunProtocol


def get_runner(*args, **kwargs):
if cfg.obtain('datalad.crawl.dryrun', default=False):
kwargs = kwargs.copy()
kwargs['protocol'] = DryRunProtocol()
return Runner(*args, **kwargs)
93 changes: 41 additions & 52 deletions datalad_crawler/nodes/annex.py
@@ -47,7 +47,6 @@

from datalad import cfg

from datalad_crawler.base import get_runner
from datalad_crawler.pipeline import initiate_pipeline_config
from datalad_crawler.dbs.files import PhysicalFileStatusesDB
from datalad_crawler.dbs.files import JsonFileStatusesDB
@@ -59,10 +58,6 @@

lgr = getLogger('datalad.crawl.annex')

_runner = get_runner()
_call = _runner.call
_run = _runner.run


# TODO: make use of datalad_stats
@auto_repr
@@ -229,15 +224,15 @@ def __call__(self, data={}):
elif existing == 'raise':
raise RuntimeError("%s already exists" % dataset_path)
elif existing == 'replace':
_call(rmtree, dataset_path)
rmtree(dataset_path)
elif existing == 'adjust':
# E.g. just regenerate configs/meta
init = False
else: # TODO: 'crawl' ;)
raise ValueError(self.existing)
if init:
_call(self._initiate_dataset, dataset_path, dataset_name)
_call(self._save_crawl_config, dataset_path, data)
self._initiate_dataset(dataset_path, dataset_name)
self._save_crawl_config(dataset_path, data)

yield data_updated

@@ -500,7 +495,7 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
assert fpath
# just add into git directly for now
# TODO: tune add so we could use its json output, and may be even batch it
out_json = _call(self.repo.add, fpath, options=self.options)
out_json = self.repo.add(fpath, options=self.options)
# elif self.mode == 'full':
# # since addurl ignores annex.largefiles we need first to download that file and then
# # annex add it
@@ -536,22 +531,22 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
if isdir(filepath):
# if directory - tricky, since we would want then to check if no
# staged changes under
_call(self._check_no_staged_changes_under_dir, filepath, stats=stats)
_call(rmtree, filepath)
self._check_no_staged_changes_under_dir(filepath, stats=stats)
rmtree(filepath)
else:
_call(unlink, filepath)
_call(stats.increment, 'overwritten')
unlink(filepath)
stats.increment('overwritten')
else:
_call(self._check_non_existing_filepath, filepath, stats=stats)
self._check_non_existing_filepath(filepath, stats=stats)
# TODO: We need to implement our special remote here since no downloaders used
if self.mode == 'full' and url_status and url_status.size: # > 1024**2:
lgr.info("Need to download %s from %s. No progress indication will be reported"
% (naturalsize(url_status.size), url))
try:
out_json = try_multiple(
6, AnnexBatchCommandError, 3, # up to 3**5=243 sec sleep
_call,
self.repo.add_url_to_file, fpath, url,
self.repo.add_url_to_file,
fpath, url,
options=annex_options, batch=self.batch_add)
except AnnexBatchCommandError as exc:
if self.skip_problematic:
@@ -564,22 +559,22 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
if self.mode == 'full' or not added_to_annex:
# we need to adjust our download stats since addurl doesn't do that and we do
# not use our downloaders here
_call(stats.increment, 'downloaded')
_call(stats.increment, 'downloaded_size', _call(lambda: os.stat(filepath).st_size))
stats.increment('downloaded')
stats.increment('downloaded_size', os.stat(filepath).st_size)

# file might have been added but really not changed anything (e.g. the same README was generated)
# TODO:
# if out_json: # if not try -- should be here!
# File might have been not modified at all, so let's check its status first
changed = set().union(*self._get_status(args=[fpath]))
if fpath in changed:
_call(stats.increment,
stats.increment(
'add_annex'
if ('key' in out_json and out_json['key'] is not None)
else 'add_git'
)
)
else:
_call(stats.increment, 'skipped')
stats.increment('skipped')

# TODO!!: sanity check that no large files are added to git directly!

@@ -592,14 +587,14 @@ def __call__(self, data): # filename=None, get_disposition_filename=False):
# _call(os.utime, filepath, (time.time(), remote_status.mtime))
# *nix only! TODO
if url_status.mtime:
_call(lmtime, filepath, url_status.mtime)
lmtime(filepath, url_status.mtime)
if statusdb:
_call(statusdb.set, filepath, url_status)
statusdb.set(filepath, url_status)
else:
# we still need to inform DB about this file so later it would signal to remove it
# if we no longer care about it
if statusdb:
_call(statusdb.set, filepath)
statusdb.set(filepath)

self._states.add("Updated git/annex from a remote location")

@@ -625,7 +620,7 @@ def _check_no_staged_changes_under_dir(self, dirpath, stats=None):
dirpath_normalized = _normalize_path(self.repo.path, dirpath)
for dirty_file in dirty_files:
if stats:
_call(stats.increment, 'removed')
stats.increment('removed')
if dirty_file.startswith(dirpath_normalized):
if self.auto_finalize:
self.finalize()({'datalad_stats': stats})
@@ -667,9 +662,9 @@ def _check_non_existing_filepath(self, filepath, stats=None):
"within git. Please commit or remove it before trying "
"to annex this new file" % locals())
lgr.debug("Removing %s as it is in the path of %s" % (dirpath, filepath))
_call(os.unlink, dirpath)
os.unlink(dirpath)
if stats:
_call(stats.increment, 'overwritten')
stats.increment('overwritten')
break # in any case -- we are done!

dirpath, _ = ops(dirpath)
@@ -857,7 +852,7 @@ def merge_branch(data):
elif strategy == 'theirs':
self.repo.merge(to_merge, options=["-s", "ours", "--no-commit"],
expect_stderr=True, **merge_kwargs)
self.repo._git_custom_command([], "git read-tree -m -u %s" % to_merge)
self.repo.call_git(["read-tree", "-m", "-u", to_merge])
self.repo.add('.', options=self.options) # so everything is staged to be committed
else:
raise NotImplementedError(strategy)
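
The 'theirs' branch of merge_branch() above emulates a "take their content" merge by first performing an ours-strategy merge without committing and then overwriting the index and worktree via git read-tree. A plain-git sketch of the same trick, with a hypothetical repository path and branch name:

# Illustrative only: plain-git equivalent of the 'theirs' handling above.
# repo_path and branch are hypothetical placeholders.
import subprocess

def merge_taking_theirs(repo_path, branch):
    # Record the merge parents but keep our tree for now.
    subprocess.run(["git", "merge", "-s", "ours", "--no-commit", branch],
                   cwd=repo_path, check=True)
    # Overwrite index and worktree with the other branch's tree, mirroring
    # the call_git(["read-tree", "-m", "-u", ...]) call in the diff above.
    subprocess.run(["git", "read-tree", "-m", "-u", branch],
                   cwd=repo_path, check=True)
    # Conclude the merge commit (the crawler stages and commits separately).
    subprocess.run(["git", "commit", "-m", "Merge %s, taking their content" % branch],
                   cwd=repo_path, check=True)
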
@@ -905,14 +900,13 @@ def _commit(self, msg=None, options=[]):
if msg is not None:
options = options + ["-m", msg]
self._precommit() # so that all batched annexes stop
self.repo._git_custom_command([], ["git", "commit"] + options,
check_fake_dates=True)
self.repo.call_git(["commit"] + options)
# self.repo.commit(msg)
# self.repo.repo.git.commit(options)

def _unstage(self, fpaths):
# self.repo.cmd_call_wrapper.run(["git", "reset"] + fpaths)
self.repo._git_custom_command(fpaths, ["git", "reset"])
self.repo.call_git(["reset"], files=fpaths)

def _stage(self, fpaths):
self.repo.add(fpaths, git=True)
@@ -925,7 +919,7 @@ def _get_status(self, args=[]):
is resolved
"""
# out, err = self.repo.cmd_call_wrapper.run(["git", "status", "--porcelain"])
cmd_args = ["git", "status", "--porcelain"] + args
cmd_args = ["status", "--porcelain"] + args
staged, notstaged, untracked, deleted = [], [], [], []
statuses = {
'??': untracked,
@@ -939,12 +933,7 @@
# TODO: handle "properly" by committing before D happens
}

if isinstance(self.repo, AnnexRepo) and self.repo.is_direct_mode():
statuses['AD'] = staged
out, err = self.repo.proxy(cmd_args)
else:
out, err = self.repo._git_custom_command([], cmd_args)
assert not err
out = self.repo.call_git(cmd_args)

for l in out.split('\n'):
if not l:
@@ -1066,7 +1055,7 @@ def _commit_versions(data):
# if a single new version -- no special treatment is needed, but we need to
# inform db about this new version
if nnew_versions == 1:
_call(setattr, versions_db, 'version', smallest_new_version)
setattr(versions_db, 'version', smallest_new_version)
# we can't return a generator here
for d in self.finalize()(data):
yield d
@@ -1078,7 +1067,7 @@ def _commit_versions(data):
nfpaths = len(fpaths)
lgr.debug("Unstaging %d files for version %s", nfpaths, version)
nunstaged += nfpaths
_call(self._unstage, list(fpaths.values()))
self._unstage(list(fpaths.values()))

stats = data.get('datalad_stats', None)
stats_str = ('\n\n' + stats.as_str(mode='full')) if stats else ''
@@ -1104,19 +1093,19 @@ def _commit_versions(data):
nunstaged -= nfpaths
assert (nfpaths >= 0)
assert (nunstaged >= 0)
_call(self._stage, vfpaths)
self._stage(vfpaths)

# RF: with .finalize() to avoid code duplication etc
# ??? what to do about stats and states? reset them or somehow tune/figure it out?
vmsg = "Multi-version commit #%d/%d: %s. Remaining unstaged: %d" % (
iversion + 1, nnew_versions, version, nunstaged)

if stats:
_call(stats.reset)
stats.reset()

if version:
_call(setattr, versions_db, 'version', version)
_call(self._commit, "%s (%s)%s" % (', '.join(self._states), vmsg, stats_str), options=[])
setattr(versions_db, 'version', version)
self._commit("%s (%s)%s" % (', '.join(self._states), vmsg, stats_str), options=[])
# unless we update data, no need to yield multiple times I guess
# but shouldn't hurt
yield data
@@ -1336,12 +1325,12 @@ def _finalize(data):
stats = data.get('datalad_stats', None)
if self.repo.dirty: # or self.tracker.dirty # for dry run
lgr.info("Repository found dirty -- adding and committing")
_call(self.repo.add, '.', options=self.options) # so everything is committed
self.repo.add('.', options=self.options) # so everything is committed

stats_str = ('\n\n' + stats.as_str(mode='full')) if stats else ''
_call(self._commit, "%s%s" % (', '.join(self._states), stats_str), options=["-a"])
self._commit("%s%s" % (', '.join(self._states), stats_str), options=["-a"])
if stats:
_call(stats.reset)
stats.reset()
else:
lgr.info("Found branch non-dirty -- nothing was committed")

@@ -1419,9 +1408,9 @@ def __call__(self_, data):
files_str = ": " + ', '.join(obsolete) if len(obsolete) < 10 else ""
lgr.info('Removing %d obsolete files%s' % (len(obsolete), files_str))
stats = data.get('datalad_stats', None)
_call(self.repo.remove, obsolete)
self.repo.remove(obsolete)
if stats:
_call(stats.increment, 'removed', len(obsolete))
stats.increment('removed', len(obsolete))
for filepath in obsolete:
statusdb.remove(filepath)
yield data
@@ -1439,16 +1428,16 @@ def remove(self, data, recursive=False):
filename = self._get_fpath(data, stats)
# TODO: not sure if we should may be check if exists, and skip/just complain if not
if stats:
_call(stats.increment, 'removed')
stats.increment('removed')
filepath = opj(self.repo.path, filename)
if lexists(filepath):
if os.path.isdir(filepath):
if recursive:
_call(self.repo.remove, filename, recursive=True)
self.repo.remove(filename, recursive=True)
else:
lgr.warning("Do not removing %s recursively, skipping", filepath)
else:
_call(self.repo.remove, filename)
self.repo.remove(filename)
else:
lgr.warning("Was asked to remove non-existing path %s", filename)
yield data
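
Across this file the commit replaces GitRepo._git_custom_command() and the indirection through _call()/_run() with direct method calls and GitRepo.call_git(). A minimal sketch of the call_git() forms used above; the repository path, file name, and commit message are placeholders, and only the argument shapes that appear in the diff itself are assumed:

# Sketch only: the GitRepo.call_git() forms this commit switches to.
# "/tmp/example-ds", "README.md", and the message are hypothetical placeholders.
from datalad.support.gitrepo import GitRepo

repo = GitRepo("/tmp/example-ds")                    # an existing git repository
status = repo.call_git(["status", "--porcelain"])    # returns stdout as a string
repo.call_git(["reset"], files=["README.md"])        # pass paths via files=...
repo.call_git(["commit", "-m", "illustrative commit"])
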
6 changes: 3 additions & 3 deletions datalad_crawler/pipeline.py
@@ -62,7 +62,7 @@
from datalad.support.network import parse_url_opts
from datalad.support.stats import ActivityStats
from datalad.support.exceptions import PipelineNotSpecifiedError
from datalad.support.configparserinc import SafeConfigParserWithIncludes
from configparser import ConfigParser

from logging import getLogger
lgr = getLogger('datalad.crawler.pipeline')
@@ -338,7 +338,7 @@ def initiate_pipeline_config(template, template_func=None, template_kwargs=None,

crawl_config_repo_path = opj(CRAWLER_META_DIR, CRAWLER_META_CONFIG_FILENAME)
crawl_config = opj(crawl_config_dir, CRAWLER_META_CONFIG_FILENAME)
cfg_ = SafeConfigParserWithIncludes()
cfg_ = ConfigParser()
cfg_.add_section(CRAWLER_PIPELINE_SECTION)

cfg_.set(CRAWLER_PIPELINE_SECTION, 'template', template)
@@ -497,7 +497,7 @@ def load_pipeline_from_config(path):
so that theoretically we could specify basic pipelines completely within
a URL
"""
cfg_ = SafeConfigParserWithIncludes()
cfg_ = ConfigParser()
cfg_.read([path])
pipeline = None
for sec in (CRAWLER_PIPELINE_SECTION, CRAWLER_PIPELINE_SECTION_DEPRECATED):
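
pipeline.py now relies on the standard library ConfigParser for the crawl configuration. A small round-trip sketch of the write/read pattern used by initiate_pipeline_config() and load_pipeline_from_config() above; the section name and file path below are placeholders rather than the module's actual constants, while the 'template' option mirrors the source:

# Sketch only: ConfigParser round trip analogous to the code above.
# "crawl:pipeline" and "crawl.cfg" are placeholders.
from configparser import ConfigParser

SECTION = "crawl:pipeline"            # stands in for CRAWLER_PIPELINE_SECTION

cfg = ConfigParser()
cfg.add_section(SECTION)
cfg.set(SECTION, "template", "openfmri")
with open("crawl.cfg", "w") as f:
    cfg.write(f)

loaded = ConfigParser()
loaded.read(["crawl.cfg"])            # read() accepts a list of paths
assert loaded.get(SECTION, "template") == "openfmri"
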
(diffs for the remaining 5 changed files are not rendered on this page)
