Skip to content

Commit

Permalink
Improved control over frame size in zstd filter.
Browse files Browse the repository at this point in the history
Instead of just `min-frame-size` and `max-frame-size`, we now have four separate options:

* `min-frame-in` delays the creation of a new frame on flush until the uncompressed size of the current frame passes a certain threshold.

* `min-frame-out` delays the creation of a new frame on flush until the compressed size of the current frame passes a certain threshold.

* `max-frame-in` forces the creation of a new frame as soon as possible after the uncompressed size of the current frame reaches a certain limit.

* `max-frame-out` forces the creation of a new frame as soon as possible after the compressed size of the current frame reaches a certain limit.

We now also support `k`, `kB`, `M`, `MB`, `G` and `GB` suffixes for all four options.

The old options are retained as aliases for the corresponding new options.
  • Loading branch information
dag-erling committed Mar 1, 2024
1 parent d8ec28c commit fae3882
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 38 deletions.
92 changes: 71 additions & 21 deletions libarchive/archive_write_add_filter_zstd.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 2017 Sean Purcell
* Copyright (c) 2023-2024 Klara, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -59,8 +60,10 @@ struct private_data {
resetting,
} state;
int frame_per_file;
size_t min_frame_size;
size_t max_frame_size;
size_t min_frame_in;
size_t max_frame_in;
size_t min_frame_out;
size_t max_frame_out;
size_t cur_frame;
size_t cur_frame_in;
size_t cur_frame_out;
Expand Down Expand Up @@ -129,8 +132,10 @@ archive_write_add_filter_zstd(struct archive *_a)
data->long_distance = 0;
#if HAVE_ZSTD_H && HAVE_LIBZSTD_COMPRESSOR
data->frame_per_file = 0;
data->min_frame_size = 0;
data->max_frame_size = SIZE_MAX;
data->min_frame_in = 0;
data->max_frame_in = SIZE_MAX;
data->min_frame_out = 0;
data->max_frame_out = SIZE_MAX;
data->cur_frame_in = 0;
data->cur_frame_out = 0;
data->cstream = ZSTD_createCStream();
Expand Down Expand Up @@ -170,7 +175,8 @@ archive_compressor_zstd_free(struct archive_write_filter *f)
return (ARCHIVE_OK);
}

static int string_to_number(const char *string, intmax_t *numberp)
static int
string_to_number(const char *string, intmax_t *numberp)
{
char *end;

Expand All @@ -184,6 +190,41 @@ static int string_to_number(const char *string, intmax_t *numberp)
return (ARCHIVE_OK);
}

static int
string_to_size(const char *string, size_t *numberp)
{
uintmax_t number;
char *end;
unsigned int shift = 0;

if (string == NULL || *string == '\0' || *string == '-')
return (ARCHIVE_WARN);
number = strtoumax(string, &end, 10);
if (end > string) {
if (*end == 'K' || *end == 'k') {
shift = 10;
end++;
} else if (*end == 'M' || *end == 'm') {
shift = 20;
end++;
} else if (*end == 'G' || *end == 'g') {
shift = 30;
end++;
}
if (*end == 'B' || *end == 'b') {
end++;
}
}
if (end == string || *end != '\0' || errno == EOVERFLOW) {
return (ARCHIVE_WARN);
}
if (number > (uintmax_t)SIZE_MAX >> shift) {
return (ARCHIVE_WARN);
}
*numberp = (size_t)(number << shift);
return (ARCHIVE_OK);
}

/*
* Set write options.
*/
Expand Down Expand Up @@ -232,25 +273,29 @@ archive_compressor_zstd_options(struct archive_write_filter *f, const char *key,
} else if (strcmp(key, "frame-per-file") == 0) {
data->frame_per_file = 1;
return (ARCHIVE_OK);
} else if (strcmp(key, "min-frame-size") == 0) {
intmax_t min_frame_size;
if (string_to_number(value, &min_frame_size) != ARCHIVE_OK) {
} else if (strcmp(key, "min-frame-in") == 0) {
if (string_to_size(value, &data->min_frame_in) != ARCHIVE_OK) {
return (ARCHIVE_WARN);
}
if (min_frame_size < 0) {
return (ARCHIVE_OK);
} else if (strcmp(key, "min-frame-out") == 0 ||
strcmp(key, "min-frame-size") == 0) {
if (string_to_size(value, &data->min_frame_out) != ARCHIVE_OK) {
return (ARCHIVE_WARN);
}
data->min_frame_size = min_frame_size;
return (ARCHIVE_OK);
} else if (strcmp(key, "max-frame-size") == 0) {
intmax_t max_frame_size;
if (string_to_number(value, &max_frame_size) != ARCHIVE_OK) {
} else if (strcmp(key, "max-frame-in") == 0 ||
strcmp(key, "max-frame-size") == 0) {
if (string_to_size(value, &data->max_frame_in) != ARCHIVE_OK ||
data->max_frame_in < 1024) {
return (ARCHIVE_WARN);
}
if (max_frame_size < 1024) {
return (ARCHIVE_OK);
} else if (strcmp(key, "max-frame-out") == 0) {
if (string_to_size(value, &data->max_frame_out) != ARCHIVE_OK ||
data->max_frame_out < 1024) {
return (ARCHIVE_WARN);
}
data->max_frame_size = max_frame_size;
return (ARCHIVE_OK);
#endif
}
Expand Down Expand Up @@ -353,9 +398,12 @@ archive_compressor_zstd_flush(struct archive_write_filter *f)
{
struct private_data *data = (struct private_data *)f->data;

if (data->frame_per_file && data->state == running &&
data->cur_frame_out > data->min_frame_size)
data->state = finishing;
if (data->frame_per_file && data->state == running) {
if (data->cur_frame_in > data->min_frame_in &&
data->cur_frame_out > data->min_frame_out) {
data->state = finishing;
}
}
return (drive_compressor(f, data, 1, NULL, 0));
}

Expand Down Expand Up @@ -414,9 +462,11 @@ drive_compressor(struct archive_write_filter *f,
data->total_in += in.pos - ipos;
data->cur_frame_in += in.pos - ipos;
data->cur_frame_out += data->out.pos - opos;
if (data->state == running &&
data->cur_frame_in >= data->max_frame_size) {
data->state = finishing;
if (data->state == running) {
if (data->cur_frame_in >= data->max_frame_in ||
data->cur_frame_out >= data->max_frame_out) {
data->state = finishing;
}
}
if (data->out.pos == data->out.size ||
(flush && data->out.pos > 0)) {
Expand Down
94 changes: 82 additions & 12 deletions libarchive/test/test_write_filter_zstd.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,28 +136,98 @@ DEFINE_TEST(test_write_filter_zstd)
/* frame-per-file: boolean */
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "frame-per-file", ""));
/* min-frame-size: >= 0 */
/* min-frame-in: >= 0 */
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "min-frame-size", ""));
archive_write_set_filter_option(a, NULL, "min-frame-out", ""));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "min-frame-size", "-1"));
archive_write_set_filter_option(a, NULL, "min-frame-out", "-1"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-size", "0"));
archive_write_set_filter_option(a, NULL, "min-frame-out", "0"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-size", "1048576"));
/* max-frame-size: >= 1024 */
archive_write_set_filter_option(a, NULL, "min-frame-out", "1048576"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-out", "1k"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-out", "1kB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-out", "1M"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-out", "1MB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-out", "1G"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-out", "1GB"));
/* min-frame-out: >= 0 */
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "min-frame-in", ""));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "min-frame-in", "-1"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "0"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "1048576"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "1k"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "1kB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "1M"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "1MB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "1G"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "min-frame-in", "1GB"));
/* max-frame-in: >= 1024 */
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-in", ""));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-size", ""));
archive_write_set_filter_option(a, NULL, "max-frame-in", "-1"));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-size", "-1"));
archive_write_set_filter_option(a, NULL, "max-frame-in", "0"));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-size", "0"));
archive_write_set_filter_option(a, NULL, "max-frame-in", "1023"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1024"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1048576"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1k"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1kB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1M"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1MB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1G"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-in", "1GB"));
/* max-frame-out: >= 1024 */
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-out", ""));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-size", "1023"));
archive_write_set_filter_option(a, NULL, "max-frame-out", "-1"));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-out", "0"));
assertEqualIntA(a, ARCHIVE_FAILED,
archive_write_set_filter_option(a, NULL, "max-frame-out", "1023"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-out", "1024"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-out", "1048576"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-out", "1k"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-out", "1kB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-out", "1M"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-out", "1MB"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-size", "1024"));
archive_write_set_filter_option(a, NULL, "max-frame-out", "1G"));
assertEqualIntA(a, ARCHIVE_OK,
archive_write_set_filter_option(a, NULL, "max-frame-size", "1048576"));
archive_write_set_filter_option(a, NULL, "max-frame-out", "1GB"));
#endif
#if ZSTD_VERSION_NUMBER >= MINVER_LONG
if ((int)(sizeof(size_t) == 4))
Expand Down
50 changes: 45 additions & 5 deletions tar/bsdtar.1
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.Dd December 10, 2023
.Dd March 1, 2024
.Dt TAR 1
.Os
.Sh NAME
Expand Down Expand Up @@ -655,16 +655,56 @@ use as many threads as there are CPU cores on the system.
.It Cm zstd:frame-per-file
Start a new compression frame at the beginning of each file in the
archive.
.It Cm zstd:min-frame-size Ns = Ns Ar N
.It Cm zstd:min-frame-in Ns = Ns Ar N
In combination with
.Cm zstd:frame-per-file ,
do not start a new compression frame unless the current frame is at least
do not start a new compression frame unless the uncompressed size of
the current frame is at least
.Ar N
bytes.
.It Cm zstd:max-frame-size Ns = Ns Ar N
Start a new compression frame as soon as the current frame exceeds
The number may be followed by
.Li k / Li kB ,
.Li M / Li MB ,
or
.Li G / Li GB
to indicate kilobytes, megabytes or gigabytes respectively.
.It Cm zstd:min-frame-out Ns = Ns Ar N , Cm zstd:min-frame-size Ns = Ns Ar N
In combination with
.Cm zstd:frame-per-file ,
do not start a new compression frame unless the compressed size of the
current frame is at least
.Ar N
bytes.
The number may be followed by
.Li k / Li kB ,
.Li M / Li MB ,
or
.Li G / Li GB
to indicate kilobytes, megabytes or gigabytes respectively.
.It Cm zstd:max-frame-in Ns = Ns Ar N , Cm zstd:max-frame-size Ns = Ns Ar N
Start a new compression frame as soon as possible after the
uncompressed size of the current frame exceeds
.Ar N
bytes.
The number may be followed by
.Li k / Li kB ,
.Li M / Li MB ,
or
.Li G / Li GB
to indicate kilobytes, megabytes or gigabytes respectively.
Values less than 1,024 will be rejected.
.It Cm zstd:max-frame-out Ns = Ns Ar N
Start a new compression frame as soon as possible after the compressed
size of the current frame exceeds
.Ar N
bytes.
The number may be followed by
.Li k / Li kB ,
.Li M / Li MB ,
or
.Li G / Li GB
to indicate kilobytes, megabytes or gigabytes respectively.
Values less than 1,024 will be rejected.
.It Cm lzop:compression-level
A decimal integer from 1 to 9 specifying the lzop compression level.
.It Cm xz:compression-level
Expand Down

0 comments on commit fae3882

Please sign in to comment.