Add --files-from and --files-from0 options #321

Open · wants to merge 5 commits into master
27 changes: 24 additions & 3 deletions attic/archiver.py
@@ -18,7 +18,7 @@
format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, \
get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \
Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
is_cachedir, bigint_to_int
is_cachedir, bigint_to_int, iter_delim, FileType
from attic.remote import RepositoryServer, RemoteRepository


@@ -116,6 +116,10 @@ def do_create(self, args):
skip_inodes.add((st.st_ino, st.st_dev))
except IOError:
pass
for f in args.filelists:
self._process_filelist(archive, cache, skip_inodes, f)
if not (f is sys.stdin or f is getattr(sys.stdin, 'buffer', None)):
f.close()
for path in args.paths:
path = os.path.normpath(path)
if args.dontcross:
@@ -142,7 +146,14 @@ def do_create(self, args):
print('-' * 78)
return self.exit_code

def _process(self, archive, cache, excludes, exclude_caches, skip_inodes, path, restrict_dev):
def _process_filelist(self, archive, cache, skip_inodes, filelist):
delim = getattr(filelist, 'delim', b'\n')
for filename in iter_delim(filelist, delim=delim, delim_out=b''):
self._process(archive, cache,
excludes=[], exclude_caches=False, skip_inodes=skip_inodes,
path=os.fsdecode(filename), restrict_dev=False, recurse=False)

def _process(self, archive, cache, excludes, exclude_caches, skip_inodes, path, restrict_dev, recurse=True):
if exclude_path(path, excludes):
return
try:
@@ -168,6 +179,8 @@ def _process(self, archive, cache, excludes, exclude_caches, skip_inodes, path,
if exclude_caches and is_cachedir(path):
return
archive.process_item(path, st)
if not recurse:
return
try:
entries = os.listdir(path)
except OSError as e:
@@ -544,6 +557,14 @@ def run(self, args=None):
subparser.add_argument('--exclude-caches', dest='exclude_caches',
action='store_true', default=False,
help='exclude directories that contain a CACHEDIR.TAG file (http://www.brynosaurus.com/cachedir/spec.html)')
subparser.add_argument('--files-from', dest='filelists',
type=FileType('rb'), action='append', default=[],
metavar='FILELIST',
help='read a list of files to back up from FILELIST, separated by newlines')
subparser.add_argument('--files-from0', dest='filelists',
type=FileType('rb', delim=b'\0'), action='append', default=[],
metavar='FILELIST',
help='read a list of files to back up from FILELIST, separated by NUL characters')
subparser.add_argument('-c', '--checkpoint-interval', dest='checkpoint_interval',
type=int, default=300, metavar='SECONDS',
help='write checkpoint every SECONDS seconds (Default: 300)')
@@ -556,7 +577,7 @@ def run(self, args=None):
subparser.add_argument('archive', metavar='ARCHIVE',
type=location_validator(archive=True),
help='archive to create')
subparser.add_argument('paths', metavar='PATH', nargs='+', type=str,
subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
help='paths to archive')

extract_epilog = textwrap.dedent("""
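For illustration: the archiver changes above read each FILELIST with iter_delim and hand every entry to _process with recurse=False, so listed paths are archived without directory recursion. A NUL-separated list of the kind --files-from0 expects could be produced with a sketch like the following; the write_filelist0 helper and the paths are hypothetical, not part of this PR.

# Illustrative sketch: build a NUL-separated file list suitable for --files-from0.
import os

def write_filelist0(root, list_path):
    """Write every regular file under root to list_path, NUL-separated."""
    with open(list_path, 'wb') as out:
        for dirpath, dirnames, filenames in os.walk(root):
            for name in filenames:
                # os.fsencode keeps arbitrary (non-UTF-8) file names intact,
                # mirroring the os.fsdecode call in _process_filelist.
                out.write(os.fsencode(os.path.join(dirpath, name)) + b'\0')

write_filelist0('input', 'filelist')
# The archive could then be created roughly as in the test suite below:
#   attic create --files-from0=filelist REPO::ARCHIVE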
57 changes: 57 additions & 0 deletions attic/helpers.py
@@ -528,6 +528,63 @@ def make_path_safe(path):
"""
return _safe_re.sub('', path) or '.'

def iter_delim(f, delim='\n', delim_out=None, read_size=4096):
"""Iterate through a file object's contents, given a delimiter.

This function returns an iterator over the contents of the
file-like object f. The contents are split into chunks on delim,
and the iterator yields one chunk at a time. By default the
original delimiter is kept at the end of each chunk, but a
replacement can be specified using delim_out.

Both text and binary files are supported, but the type of delim
and delim_out must match the file type, i.e. they must be strings
for text files, and bytes for binary files.

"""
if delim_out is None:
delim_out = delim
bufs = []
empty = None
while True:
data = f.read(read_size)
if not data:
break
if empty is None:
empty = '' if isinstance(data, str) else b''
start = 0
while True:
pos = data.find(delim, start)
if pos < 0:
break
yield empty.join(bufs) + data[start:pos] + delim_out
start = pos + len(delim)
bufs = []
if start < len(data):
bufs.append(data[start:])
if len(bufs) > 0:
yield empty.join(bufs)

Contributor review comment:
Please add some unit tests just for this function. Consider edge cases like:
- an empty input file
- a file that starts or ends with the delimiter
- two delimiters directly following each other
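
To make these edge cases concrete, here is what iter_delim as implemented above yields for them; a minimal illustration, assuming attic.helpers exposes iter_delim as added in this PR. The corresponding unit tests are added in attic/testsuite/helpers.py further down.

# Minimal illustration of the iter_delim edge cases (not a replacement for the tests).
import io
from attic.helpers import iter_delim

assert list(iter_delim(io.BytesIO(b''), delim=b'\0')) == []                           # empty input
assert list(iter_delim(io.BytesIO(b'\0a\0'), delim=b'\0')) == [b'\0', b'a\0']         # starts/ends with delim
assert list(iter_delim(io.BytesIO(b'a\0\0b'), delim=b'\0')) == [b'a\0', b'\0', b'b']  # two delims in a row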


class FileType(argparse.FileType):
"""Extended version of argparse.FileType.

Allows specifying additional attributes to be set on the returned
file objects.

"""
def __init__(self, mode='r', bufsize=-1, **kwargs):
super().__init__(mode=mode, bufsize=bufsize)
self._attrs = kwargs
self._binary = 'b' in mode

def __call__(self, string):
result = super().__call__(string)
# Work around http://bugs.python.org/issue14156
if self._binary and (result is sys.stdin or result is sys.stdout):
result = result.buffer
for key, value in self._attrs.items():
setattr(result, key, value)
return result

def daemonize():
"""Detach process from controlling terminal and run in background
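As a usage note, the extended FileType simply attaches the extra keyword attributes to the opened file object, and _process_filelist later retrieves the delimiter via getattr(filelist, 'delim', b'\n'). A minimal sketch, assuming attic.helpers.FileType as added in this PR and a hypothetical filelist path:

import argparse
from attic.helpers import FileType

parser = argparse.ArgumentParser()
parser.add_argument('--files-from0', dest='filelists', metavar='FILELIST',
                    type=FileType('rb', delim=b'\0'), action='append', default=[])
# 'filelist' is a hypothetical path; passing '-' instead would return
# sys.stdin.buffer thanks to the workaround for issue14156 above.
args = parser.parse_args(['--files-from0=filelist'])
f = args.filelists[0]
print(f.delim)  # b'\x00', later consumed by iter_delim in _process_filelist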
25 changes: 25 additions & 0 deletions attic/testsuite/archiver.py
@@ -9,6 +9,7 @@
import tempfile
import time
import unittest
import itertools
from hashlib import sha256
from attic import xattr
from attic.archive import Archive, ChunkBuffer
@@ -59,6 +60,9 @@ def __exit__(self, *args, **kw):
if v is not None:
os.environ[k] = v

def listdir_recursive(dirname):
return itertools.chain(*[[os.path.normpath(os.path.join(dirpath, f)) for f in filenames]
for dirpath, dirnames, filenames in os.walk(dirname)])

class ArchiverTestCaseBase(AtticTestCase):

@@ -262,6 +266,27 @@ def test_exclude_caches(self):
self.assert_equal(sorted(os.listdir('output/input')), ['cache2', 'file1'])
self.assert_equal(sorted(os.listdir('output/input/cache2')), ['CACHEDIR.TAG'])

def test_files_from(self):
self._test_files_from_option(delim=b'\n', option='--files-from')

def test_files_from0(self):
self._test_files_from_option(delim=b'\0', option='--files-from0')

def _test_files_from_option(self, *, delim, option):
self.attic('init', self.repository_location)
for filename in ['file1', 'non-listed/file', 'listed/file']:
self.create_regular_file(filename, size=1024 * 80)
listed_files = sorted(['file1', 'listed/file'])
self.create_regular_file('filelist',
contents=delim.join([os.path.join('input', f).encode('ascii')
for f in listed_files]))
self.attic('create', option + '=input/filelist', self.repository_location + '::test')
with changedir('output'):
self.attic('extract', self.repository_location + '::test')
with changedir('output/input'):
present_files = sorted(listdir_recursive('.'))
self.assert_equal(present_files, listed_files)

def test_path_normalization(self):
self.attic('init', self.repository_location)
self.create_regular_file('dir1/dir2/file', size=1024 * 80)
124 changes: 123 additions & 1 deletion attic/testsuite/helpers.py
@@ -1,11 +1,13 @@
import hashlib
from time import mktime, strptime
from datetime import datetime, timezone, timedelta
import sys
import os
import io
import tempfile
import unittest
from attic.helpers import adjust_patterns, exclude_path, Location, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, UpgradableLock, prune_within, prune_split, to_localtime, \
StableDict, int_to_bigint, bigint_to_int, parse_timestamp
StableDict, int_to_bigint, bigint_to_int, parse_timestamp, iter_delim, FileType
from attic.testsuite import AtticTestCase
import msgpack

@@ -216,3 +218,123 @@ class TestParseTimestamp(AtticTestCase):
def test(self):
self.assert_equal(parse_timestamp('2015-04-19T20:25:00.226410'), datetime(2015, 4, 19, 20, 25, 0, 226410, timezone.utc))
self.assert_equal(parse_timestamp('2015-04-19T20:25:00'), datetime(2015, 4, 19, 20, 25, 0, 0, timezone.utc))


class TestIterDelim(AtticTestCase):
def test_text_basic(self):
self.assert_equal(self._delim_text(''), [])
self.assert_equal(self._delim_text('last line'), ['last line'])
self.assert_equal(self._delim_text('first line\nsecond line\n'),
['first line\n', 'second line\n'])
self.assert_equal(self._delim_text('line 1\n\nline 3'),
['line 1\n', '\n', 'line 3'])
self.assert_equal(self._delim_text('\n\nline 3\n'),
['\n', '\n', 'line 3\n'])

def test_text_delim_out(self):
self.assert_equal(self._delim_text('last line', delim_out='\r\n'),
['last line'])
self.assert_equal(self._delim_text('first line\nsecond line\n', delim_out=''),
['first line', 'second line'])
self.assert_equal(self._delim_text('line 1\n\nline 3', delim_out=''),
['line 1', '', 'line 3'])
self.assert_equal(self._delim_text('\n\nline 3\n', delim_out=''),
['', '', 'line 3'])
self.assert_equal(self._delim_text('\n\nline 3\n\nline 5', delim_out='\r\n'),
['\r\n', '\r\n', 'line 3\r\n', '\r\n', 'line 5'])

def test_text_crlf(self):
self.assert_equal(self._delim_text('last line', delim='\r\n', delim_out='\n'),
['last line'])
self.assert_equal(self._delim_text('first line\nsecond line\r\n',
delim='\r\n', delim_out='\0'),
['first line\nsecond line\0'])
self.assert_equal(self._delim_text('line 1\r\n\r\nline 3',
delim='\r\n', delim_out=''),
['line 1', '', 'line 3'])
self.assert_equal(self._delim_text('\r\n\r\nline 3\n', delim='\r\n', delim_out=''),
['', '', 'line 3\n'])
self.assert_equal(self._delim_text('\r\n\r\nline 3\nline 5',
delim='\r\n', delim_out='\0'),
['\0', '\0', 'line 3\nline 5'])

def test_binary_basic(self):
self.assert_equal(self._delim_binary(b''), [])
self.assert_equal(self._delim_binary(b'last line'), [b'last line'])
self.assert_equal(self._delim_binary(b'first line\0second line\0'),
[b'first line\0', b'second line\0'])
self.assert_equal(self._delim_binary(b'line 1\0\0line 3'),
[b'line 1\0', b'\0', b'line 3'])
self.assert_equal(self._delim_binary(b'\0\0line 3\0'),
[b'\0', b'\0', b'line 3\0'])

def test_binary_delim_out(self):
self.assert_equal(self._delim_binary(b'last line', delim_out=b'\r\n'),
[b'last line'])
self.assert_equal(self._delim_binary(b'first line\0second line\0', delim_out=b''),
[b'first line', b'second line'])
self.assert_equal(self._delim_binary(b'line 1\0\0line 3', delim_out=b''),
[b'line 1', b'', b'line 3'])
self.assert_equal(self._delim_binary(b'\0\0line 3\0', delim_out=b''),
[b'', b'', b'line 3'])
self.assert_equal(self._delim_binary(b'\0\0line 3\0\0line 5', delim_out=b'\r\n'),
[b'\r\n', b'\r\n', b'line 3\r\n', b'\r\n', b'line 5'])

def test_binary_crlf(self):
self.assert_equal(self._delim_binary(b'last line', delim=b'\r\n', delim_out=b'\n'),
[b'last line'])
self.assert_equal(self._delim_binary(b'first line\nsecond line\r\n',
delim=b'\r\n', delim_out=b'\0'),
[b'first line\nsecond line\0'])
self.assert_equal(self._delim_binary(b'line 1\r\n\r\nline 3',
delim=b'\r\n', delim_out=b''),
[b'line 1', b'', b'line 3'])
self.assert_equal(self._delim_binary(b'\r\n\r\nline 3\n', delim=b'\r\n', delim_out=b''),
[b'', b'', b'line 3\n'])
self.assert_equal(self._delim_binary(b'\r\n\r\nline 3\0line 5',
delim=b'\r\n', delim_out=b'\0'),
[b'\0', b'\0', b'line 3\0line 5'])

def _delim_text(self, text, delim='\n', delim_out=None):
return list(iter_delim(io.StringIO(text), delim=delim, delim_out=delim_out))

def _delim_binary(self, content, delim=b'\0', delim_out=None):
return list(iter_delim(io.BytesIO(content), delim=delim, delim_out=delim_out))


class TestFileType(AtticTestCase):
def test_attr(self):
f = FileType(delim='\0', foobar=42)('-')
self.assert_equal(f.delim, '\0')
self.assert_equal(f.foobar, 42)

def test_text_stdin(self):
f = FileType(mode='r', delim='\n')('-')
self.assert_equal(f.delim, '\n')
self.assert_true(isinstance(f, io.IOBase))
self.assert_equal(isinstance(f, io.TextIOBase), True)

def test_text_stdout(self):
f = FileType(mode='w', delim='\n')('-')
self.assert_equal(f.delim, '\n')
self.assert_true(isinstance(f, io.IOBase))
self.assert_equal(isinstance(f, io.TextIOBase), True)

def test_binary_stdin(self):
f = FileType(mode='rb', delim=b'\0')('-')
self.assert_equal(f.delim, b'\0')
self.assert_true(isinstance(f, io.IOBase))
self.assert_equal(isinstance(f, io.TextIOBase), False)

def test_binary_stdout(self):
# We cannot use the unittest.skipIf decorator here: at decorator
# evaluation time sys.stdout still has the 'buffer' attribute, but
# it loses that attribute before the test actually runs when the
# test suite is invoked with the '-b' ("Buffer stdout and stderr
# during tests") argument.
if not hasattr(sys.stdout, 'buffer'):
self.skipTest('Need sys.stdout.buffer')
f = FileType(mode='wb', delim=b'\0')('-')
self.assert_equal(f.delim, b'\0')
self.assert_true(isinstance(f, io.IOBase))
self.assert_equal(isinstance(f, io.TextIOBase), False)