Skip to content

Commit

Permalink
Add support for --wrap option
Browse files Browse the repository at this point in the history
The previous behavior of `bagit.py` was asymmetrical with respect to how
it treated long tag lines: it would handle folded lines when reading
the `bag-info.txt` file, but it would not fold long lines when writing
long tag values into `bag-info.txt`.  This was hardwired into the
function `_make_tag_file()`, which explicitly stripped line endings from
tag values before writing them.

Section 2.2.2 of the BagIt spec (https://tools.ietf.org/html/rfc8493)
states the following:

   It is RECOMMENDED that lines not exceed 79 characters in length.
   Long values MAY be continued onto the next line by inserting a LF,
   CR, or CRLF, and then indenting the next line with one or more linear
   white space characters (spaces or tabs).  Except for linebreaks, such
   padding does not form part of the value.

This commit adds a new command-line option, `--wrap`, and a new
parameter named `line_length` to the functions `make_bag()` and
`save()`, to make it possible to follow the recommendation. The
default value is 0, which means don't wrap (which is the original
behavior).  An integer value greater than 0 causes line-wrapping to be
performed on a best-effort basis to limit line lengths to the given
value.
  • Loading branch information
mhucka committed Feb 2, 2019
1 parent 8a8263e commit 80686e8
Showing 1 changed file with 36 additions and 7 deletions.
43 changes: 36 additions & 7 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import signal
import sys
import tempfile
import textwrap
import unicodedata
import warnings
from collections import defaultdict
Expand Down Expand Up @@ -137,7 +138,8 @@ def find_locale_dir():


def make_bag(
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None,
encoding="utf-8", line_length=0
):
"""
Convert a given directory into a bag. You can pass in arbitrary
Expand Down Expand Up @@ -256,7 +258,7 @@ def make_bag(
)

bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)
_make_tag_file("bag-info.txt", bag_info)
_make_tag_file("bag-info.txt", bag_info, line_length)

for c in checksums:
_make_tagmanifest_file(c, bag_dir, encoding="utf-8")
Expand Down Expand Up @@ -450,7 +452,7 @@ def payload_entries(self):
if key.startswith("data" + os.sep)
)

def save(self, processes=1, manifests=False):
def save(self, processes=1, manifests=False, line_length=0):
"""
save will persist any changes that have been made to the bag
metadata (self.info).
Expand All @@ -463,6 +465,11 @@ def save(self, processes=1, manifests=False):
If you want to control the number of processes that are used when
recalculating checksums use the processes parameter.
If you want long tag values to be wrapped by breaking long strings at
whitespace characters, set line_length to a value greater than 0. An
integer value greater than 0 causes line-wrapping to be performed on
a best-effort basis to limit line lengths to the given value.
"""
# Error checking
if not self.path:
Expand Down Expand Up @@ -514,7 +521,7 @@ def save(self, processes=1, manifests=False):
LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name)
self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)

_make_tag_file(self.tag_file_name, self.info)
_make_tag_file(self.tag_file_name, self.info, line_length)

# Update tag-manifest for changes to manifest & bag-info files
for alg in self.algorithms:
Expand Down Expand Up @@ -1219,16 +1226,26 @@ def _parse_tags(tag_file):
yield (tag_name, tag_value.strip())


def _make_tag_file(bag_info_path, bag_info):
def _make_tag_file(bag_info_path, bag_info, line_length):
headers = sorted(bag_info.keys())
with open_text_file(bag_info_path, "w") as f:
for h in headers:
values = bag_info[h]
if not isinstance(values, list):
values = [values]
for txt in values:
# strip CR, LF and CRLF so they don't mess up the tag file
txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt))
txt = force_unicode(txt)
if line_length > 1:
# Need to account for the length of the tag name. This
# adds an initial space, then removes it below.
txt = '\n'.join(textwrap.wrap(txt, width=line_length,
initial_indent=' '*(len(h) + 2),
break_long_words=False,
break_on_hyphens=False,
subsequent_indent=' '))
txt = txt[len(h) + 2:]
else:
txt = re.sub(r"\n|\r|(\r\n)", "", txt)
f.write("%s: %s\n" % (h, txt))


Expand Down Expand Up @@ -1499,6 +1516,17 @@ def _make_parser():
" without performing checksum validation to detect corruption."
),
)
parser.add_argument(
"--wrap",
type=int,
dest="line_length",
default=0,
help=_(
"Limit line lengths in the tag file (bag-info.txt) by"
" wrapping long values and indenting subsequent lines with a"
" space character. (Default: don't.)"
),
)

checksum_args = parser.add_argument_group(
_("Checksum Algorithms"),
Expand Down Expand Up @@ -1596,6 +1624,7 @@ def main():
bag_info=args.bag_info,
processes=args.processes,
checksums=args.checksums,
line_length=args.line_length,
)
except Exception as exc:
LOGGER.error(
Expand Down

0 comments on commit 80686e8

Please sign in to comment.