Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --files-from and --files-from0 options #321

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions attic/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, \
get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
is_cachedir, bigint_to_int
is_cachedir, bigint_to_int, iter_delim, FileType
from attic.remote import RepositoryServer, RemoteRepository


Expand Down Expand Up @@ -116,6 +116,10 @@ def do_create(self, args):
skip_inodes.add((st.st_ino, st.st_dev))
except IOError:
pass
for f in args.filelists:
self._process_filelist(archive, cache, skip_inodes, f)
if not (f is sys.stdin or f is getattr(sys.stdin, 'buffer', None)):
f.close()
for path in args.paths:
path = os.path.normpath(path)
if args.dontcross:
Expand All @@ -142,7 +146,14 @@ def do_create(self, args):
print('-' * 78)
return self.exit_code

def _process(self, archive, cache, excludes, exclude_caches, skip_inodes, path, restrict_dev):
def _process_filelist(self, archive, cache, skip_inodes, filelist):
    """Archive every path named in *filelist*, one record per delimiter.

    The file object's ``delim`` attribute (attached by ``FileType``)
    selects the record separator; a newline is assumed when the
    attribute is absent.  Listed directories are stored themselves but
    not recursed into (``recurse=False``) — the list is authoritative.
    """
    separator = getattr(filelist, 'delim', b'\n')
    for raw_path in iter_delim(filelist, delim=separator, delim_out=b''):
        self._process(archive, cache, excludes=[], exclude_caches=False,
                      skip_inodes=skip_inodes, path=os.fsdecode(raw_path),
                      restrict_dev=False, recurse=False)

def _process(self, archive, cache, excludes, exclude_caches, skip_inodes, path, restrict_dev, recurse=True):
if exclude_path(path, excludes):
return
try:
Expand All @@ -168,6 +179,8 @@ def _process(self, archive, cache, excludes, exclude_caches, skip_inodes, path,
if exclude_caches and is_cachedir(path):
return
archive.process_item(path, st)
if not recurse:
return
try:
entries = os.listdir(path)
except OSError as e:
Expand Down Expand Up @@ -544,6 +557,14 @@ def run(self, args=None):
subparser.add_argument('--exclude-caches', dest='exclude_caches',
action='store_true', default=False,
help='exclude directories that contain a CACHEDIR.TAG file (http://www.brynosaurus.com/cachedir/spec.html)')
subparser.add_argument('--files-from', dest='filelists',
type=FileType('rb'), action='append', default=[],
metavar='FILELIST',
help='read a list of files to backup from FILELIST, separated by newlines')
subparser.add_argument('--files-from0', dest='filelists',
type=FileType('rb', delim=b'\0'), action='append', default=[],
metavar='FILELIST',
help='read a list of files to backup from FILELIST, separated by NUL characters')
subparser.add_argument('-c', '--checkpoint-interval', dest='checkpoint_interval',
type=int, default=300, metavar='SECONDS',
help='write checkpoint every SECONDS seconds (Default: 300)')
Expand All @@ -556,7 +577,7 @@ def run(self, args=None):
subparser.add_argument('archive', metavar='ARCHIVE',
type=location_validator(archive=True),
help='archive to create')
subparser.add_argument('paths', metavar='PATH', nargs='+', type=str,
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
help='paths to archive')

extract_epilog = textwrap.dedent("""
Expand Down
57 changes: 57 additions & 0 deletions attic/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,63 @@ def make_path_safe(path):
"""
return _safe_re.sub('', path) or '.'

def iter_delim(f, delim='\n', delim_out=None, read_size=4096):
    """Iterate through a file object's contents, given a delimiter.

    Returns an iterator over the chunks of the file-like object *f*,
    split on *delim*.  By default each chunk retains its trailing
    delimiter; pass *delim_out* to substitute a replacement (e.g. ``b''``
    to strip it).  A final chunk that lacks a trailing delimiter is
    yielded as-is, so a file ending in *delim* produces no empty
    trailing chunk; an empty file yields nothing; consecutive
    delimiters yield empty chunks (plus *delim_out*).

    Both text and binary files are supported, but the type of *delim*
    and *delim_out* must match the file type, i.e. they must be strings
    for text files, and bytes for binary files.
    """
    if delim_out is None:
        delim_out = delim
    # Unterminated tail carried across reads.  Keeping it and re-searching
    # the concatenation guarantees a multi-character delimiter that is
    # split across two read() chunks is still found (the previous
    # chunk-local search missed that case).
    pending = None
    while True:
        data = f.read(read_size)
        if not data:
            break
        buf = data if pending is None else pending + data
        start = 0
        while True:
            pos = buf.find(delim, start)
            if pos < 0:
                break
            yield buf[start:pos] + delim_out
            start = pos + len(delim)
        pending = buf[start:]
    if pending:
        yield pending
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add some unit tests just for this function.
Consider edge cases like:
- an empty input file
- a file that starts or ends with the delimiter
- two delimiters directly following each other


class FileType(argparse.FileType):
    """Extended version of argparse.FileType.

    Allows to specify additional attributes to be set on the returned
    file objects (e.g. ``delim=b'\\0'``), so downstream code can ask the
    file object itself which record separator to use.
    """

    def __init__(self, mode='r', bufsize=-1, **kwargs):
        """*kwargs* are stored and set as attributes on every file
        object this factory returns."""
        super().__init__(mode=mode, bufsize=bufsize)
        self._attrs = kwargs
        self._binary = 'b' in mode

    def __call__(self, string):
        result = super().__call__(string)
        # Work around http://bugs.python.org/issue14156: argparse hands
        # back the *text* std streams even when a binary mode was
        # requested, so switch to the underlying binary buffer.  The
        # parentheses are required — without them the text-mode stdout
        # would be swapped to its buffer as well ('and' binds tighter
        # than 'or').
        if self._binary and (result is sys.stdin or result is sys.stdout):
            result = result.buffer
        for key, value in self._attrs.items():
            setattr(result, key, value)
        return result

def daemonize():
"""Detach process from controlling terminal and run in background
Expand Down
25 changes: 25 additions & 0 deletions attic/testsuite/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import tempfile
import time
import unittest
import itertools
from hashlib import sha256
from attic import xattr
from attic.archive import Archive, ChunkBuffer
Expand Down Expand Up @@ -59,6 +60,9 @@ def __exit__(self, *args, **kw):
if v is not None:
os.environ[k] = v

def listdir_recursive(dirname):
    """Return an iterator over the normalized paths of every file found
    anywhere below *dirname* (directories themselves are not listed)."""
    # chain.from_iterable keeps the walk lazy instead of materializing
    # every per-directory list up front, as chain(*[...]) would.
    return itertools.chain.from_iterable(
        [os.path.normpath(os.path.join(dirpath, f)) for f in filenames]
        for dirpath, dirnames, filenames in os.walk(dirname))

class ArchiverTestCaseBase(AtticTestCase):

Expand Down Expand Up @@ -262,6 +266,27 @@ def test_exclude_caches(self):
self.assert_equal(sorted(os.listdir('output/input')), ['cache2', 'file1'])
self.assert_equal(sorted(os.listdir('output/input/cache2')), ['CACHEDIR.TAG'])

def test_files_from(self):
    """--files-from: newline-separated file list."""
    self._test_files_from_option(option='--files-from', delim=b'\n')

def test_files_from0(self):
    """--files-from0: NUL-separated file list."""
    self._test_files_from_option(option='--files-from0', delim=b'\0')

def _test_files_from_option(self, *, delim, option):
    """Create an archive from a *delim*-separated file list and verify
    that extraction restores exactly the listed files and nothing else."""
    self.attic('init', self.repository_location)
    for name in ('file1', 'non-listed/file', 'listed/file'):
        self.create_regular_file(name, size=1024 * 80)
    wanted = sorted(['file1', 'listed/file'])
    listing = delim.join(os.path.join('input', name).encode('ascii')
                         for name in wanted)
    self.create_regular_file('filelist', contents=listing)
    self.attic('create', option + '=input/filelist',
               self.repository_location + '::test')
    with changedir('output'):
        self.attic('extract', self.repository_location + '::test')
    with changedir('output/input'):
        self.assert_equal(sorted(listdir_recursive('.')), wanted)

def test_path_normalization(self):
self.attic('init', self.repository_location)
self.create_regular_file('dir1/dir2/file', size=1024 * 80)
Expand Down