Skip to content

gh-132983: Add compression.zstd and Python tests #133365

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 56 commits into from
May 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
f4cd026
Add Python files
emmatyping Apr 30, 2025
4168895
Fix byteswarning in test
emmatyping Apr 30, 2025
a22fa9b
Remove shape tests
emmatyping May 3, 2025
e70e03b
Make namedtuples dataclasses
emmatyping May 4, 2025
cbf0ef8
Apply suggestions from AA-Turner
emmatyping May 4, 2025
298b369
Clean up chunk calculations in train_dict
emmatyping May 4, 2025
a22be68
Fix _CLValues instantiation
emmatyping May 4, 2025
b30ed02
More cleanup of train_/finalize_dict
emmatyping May 4, 2025
307a894
Have train_/finalize_dict take tuple not list
emmatyping May 4, 2025
dd716a4
Ensure trailing data raises errors
emmatyping May 4, 2025
9b4765b
Remove parameter bounds caching and unsupported...
emmatyping May 4, 2025
214cd60
Use kwargs for code clarity
emmatyping May 4, 2025
e1f53b1
Clean up imports in zstd tests
emmatyping May 4, 2025
4c00026
Use _1K instead of 1024 in tests
emmatyping May 4, 2025
99653d2
Move compression.zstd.zstdfile to compression.zstd._zstdfile
emmatyping May 4, 2025
e403a25
Change compressLevel_values to COMPRESSION_LEVEL_DEFAULT
emmatyping May 4, 2025
63625bc
Fix tests for change in error message
emmatyping May 4, 2025
1ea4b9a
Make parameter names snake case
emmatyping May 4, 2025
ad05da8
Replace compressionLevel_values re-export with COMPRESSION_LEVEL_DEFAULT
emmatyping May 4, 2025
e82e23d
Move zstd_support_multithread to tests and rename
emmatyping May 4, 2025
7801b6b
Update module docstring for compression.zstd
emmatyping May 4, 2025
df5d827
Clarify Strategy stability in docstring
emmatyping May 4, 2025
4ff48da
Fix formatting in tarfile
emmatyping May 4, 2025
c68a896
Remove zstd_support_multithread from __all__
emmatyping May 4, 2025
326400d
Add test_name from upstream
emmatyping May 4, 2025
2c0c9a1
Don't close tarfile if there is a BaseException
emmatyping May 4, 2025
49f3821
Use options kwarg in tests
emmatyping May 4, 2025
8ba6bda
Use options kwarg in tests in more places
emmatyping May 4, 2025
129d5e6
Adopt suggestions by Tomas R. for _zstdfile
emmatyping May 4, 2025
7d54d35
Formatting fixes in zstd tests
emmatyping May 4, 2025
03795ec
Improve docstrings for (de)compress
emmatyping May 4, 2025
01fcfcb
Fix some line length issues
emmatyping May 4, 2025
f04494c
Improve docstring on C/DParameter.bounds()
emmatyping May 4, 2025
caa40b1
Improve docstrings and formatting
emmatyping May 4, 2025
4584ec5
Add missing f string prefix
emmatyping May 4, 2025
3cafdc6
Fix weird indent in _zstdfile.py
emmatyping May 4, 2025
8cb0846
Use io.open instead of builtins.open
emmatyping May 5, 2025
c7d5d67
Remove _READER_CLASS from ZstdFile
emmatyping May 5, 2025
a56a22e
Adopt many suggestions from AA-Turner for ZstdFile
emmatyping May 5, 2025
7e919c8
Set self._buffer to None
emmatyping May 5, 2025
389faed
Move _nbytes to _zstdfile.py
emmatyping May 5, 2025
006ef2e
Move test_zstd to file
emmatyping May 5, 2025
c846b78
Rename C/DParameter to (De)CompressionParameter
emmatyping May 5, 2025
fa0cb0c
regen clinic
AA-Turner May 5, 2025
74e4d2b
Fix whitespace issue
emmatyping May 5, 2025
03fff3d
Remove makefile test dir
emmatyping May 5, 2025
a99c5dd
swap order of parameters in _get_param_bounds
AA-Turner May 5, 2025
a12a031
Merge branch 'main' into 3.14-zstd-python-code
emmatyping May 5, 2025
bf94aad
Sort imports
AA-Turner May 5, 2025
5b45ec7
Improve docstrings
AA-Turner May 5, 2025
b0eca5a
Remove comments
AA-Turner May 5, 2025
c0d0e10
Remove unused private variables
AA-Turner May 5, 2025
10f0cff
Misc changes (positional-only, style, error messages)
AA-Turner May 5, 2025
7f8c350
whitespace
AA-Turner May 5, 2025
bf4b07d
Remove _set_parameter_types
AA-Turner May 5, 2025
eaf46a8
Revert "Remove _set_parameter_types"
emmatyping May 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 234 additions & 0 deletions Lib/compression/zstd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""

# Names exported by ``from compression.zstd import *``, grouped by the module
# that supplies each one.
__all__ = (
    # Defined in this module (compression.zstd).
    "COMPRESSION_LEVEL_DEFAULT",
    "compress",
    "CompressionParameter",
    "decompress",
    "DecompressionParameter",
    "finalize_dict",
    "get_frame_info",
    "Strategy",
    "train_dict",

    # Imported from compression.zstd._zstdfile.  Note that "open" is that
    # module's open(), not the builtin.
    "open",
    "ZstdFile",

    # Re-exported from the _zstd module via ``from _zstd import *``.
    "get_frame_size",
    "zstd_version",
    "zstd_version_info",
    "ZstdCompressor",
    "ZstdDecompressor",
    "ZstdDict",
    "ZstdError",
)

import _zstd
import enum
from _zstd import *
from compression.zstd._zstdfile import ZstdFile, open, _nbytes

# Read from the first entry of the private _zstd._compressionLevel_values
# sequence.  NOTE(review): presumably that sequence is ordered
# (default, min, max) -- confirm against the _zstd implementation.
COMPRESSION_LEVEL_DEFAULT = _zstd._compressionLevel_values[0]
"""The default compression level for Zstandard, currently '3'."""


class FrameInfo:
    """Read-only record describing a Zstandard frame.

    Instances carry exactly two attributes, *decompressed_size* and
    *dictionary_id*, both supplied at construction time.  Assigning to any
    attribute afterwards raises AttributeError.
    """

    __slots__ = ('decompressed_size', 'dictionary_id')

    def __init__(self, decompressed_size, dictionary_id):
        # __setattr__ below rejects every assignment, so fill the slots
        # through the parent implementation instead.
        set_slot = super().__setattr__
        set_slot('decompressed_size', decompressed_size)
        set_slot('dictionary_id', dictionary_id)

    def __repr__(self):
        size = self.decompressed_size
        dict_id = self.dictionary_id
        return (f'FrameInfo(decompressed_size={size}, '
                f'dictionary_id={dict_id})')

    def __setattr__(self, name, _):
        # Unconditional: the record is immutable once constructed.
        raise AttributeError(f"can't set attribute {name!r}")


def get_frame_info(frame_buffer):
    """Parse a Zstandard frame header and return it as a FrameInfo.

    *frame_buffer* is a bytes-like object positioned at the start of a
    frame.  It must contain at least the complete frame header, which is
    between 6 and 18 bytes long.

    The returned FrameInfo object exposes two attributes:

    'decompressed_size' is the size in bytes of the frame's data once
    decompressed, or None when that size is unknown.

    'dictionary_id' is an int in the range [0, 2**32).  The special value 0
    means that no dictionary ID was recorded in the frame header; such a
    frame may or may not need a dictionary to be decoded, and the ID of
    that dictionary is not specified.
    """
    header_fields = _zstd._get_frame_info(frame_buffer)
    return FrameInfo(*header_fields)


def train_dict(samples, dict_size):
    """Train a Zstandard dictionary on *samples* and return it as a ZstdDict.

    *samples* is an iterable of bytes-like objects, each one representing a
    file.

    *dict_size* is the maximum size, in bytes, of the resulting dictionary.

    Raises TypeError if *dict_size* is not an int, and ValueError if the
    samples contain no data at all.
    """
    if not isinstance(dict_size, int):
        ds_cls = type(dict_size).__qualname__
        raise TypeError(f'dict_size must be an int object, not {ds_cls!r}.')

    # Materialise the iterable once; it is walked twice below.
    sample_seq = tuple(samples)
    # The samples are handed over as one contiguous buffer plus the length
    # of each individual sample.
    joined = b''.join(sample_seq)
    sizes = tuple(map(_nbytes, sample_seq))
    if not joined:
        raise ValueError("samples contained no data; can't train dictionary.")
    return ZstdDict(_zstd._train_dict(joined, sizes, dict_size))


def finalize_dict(zstd_dict, /, samples, dict_size, level):
    """Return a ZstdDict representing a finalized Zstandard dictionary.

    Given a custom content as a basis for dictionary, and a set of samples,
    finalize *zstd_dict* by adding headers and statistics according to the
    Zstandard dictionary format.

    You may compose an effective dictionary content by hand, which is used as
    basis dictionary, and use some samples to finalize a dictionary. The basis
    dictionary may be a "raw content" dictionary. See *is_raw* in ZstdDict.

    *samples* is an iterable of samples, where a sample is a bytes-like object
    representing a file.
    *dict_size* is the dictionary's maximum size, in bytes.
    *level* is the expected compression level. The statistics for each
    compression level differ, so tuning the dictionary to the compression level
    can provide improvements.

    Raises TypeError if *zstd_dict* is not a ZstdDict or if *dict_size* or
    *level* is not an int, and ValueError if the samples contain no data.
    """
    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    # Materialise the iterable once; it is walked twice below.
    samples = tuple(samples)
    # The samples are handed over as one contiguous buffer plus the length
    # of each individual sample.
    chunks = b''.join(samples)
    chunk_sizes = tuple(_nbytes(sample) for sample in samples)
    if not chunks:
        # The trailing space matters: adjacent string literals are joined
        # with no separator.
        raise ValueError("The samples are empty content, can't finalize the "
                         "dictionary.")
    dict_content = _zstd._finalize_dict(zstd_dict.dict_content,
                                        chunks, chunk_sizes,
                                        dict_size, level)
    return ZstdDict(dict_content)

def compress(data, level=None, options=None, zstd_dict=None):
    """Compress *data* with Zstandard and return the result as bytes.

    *level* is an int giving the compression level; when omitted it defaults
    to COMPRESSION_LEVEL_DEFAULT ('3').
    *options* is a dict of advanced compression parameters. See
    CompressionParameter for more on options.
    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
    the function train_dict for how to train a ZstdDict on sample data.

    All of *data* is compressed in a single call using
    ZstdCompressor.FLUSH_FRAME mode. For incremental compression, use a
    ZstdCompressor instead.
    """
    compressor = ZstdCompressor(
        level=level,
        options=options,
        zstd_dict=zstd_dict,
    )
    frame_mode = ZstdCompressor.FLUSH_FRAME
    return compressor.compress(data, mode=frame_mode)

def decompress(data, zstd_dict=None, options=None):
    """Decompress one or more concatenated Zstandard frames in *data*.

    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
    the function train_dict for how to train a ZstdDict on sample data.
    *options* is a dict of advanced decompression parameters. See
    DecompressionParameter for more on options.

    Returns the decompressed contents of all frames joined together as
    bytes. Raises ZstdError if the input ends before a frame is complete.

    For incremental decompression, use a ZstdDecompressor instead.
    """
    frames = []
    remaining = data
    while True:
        # A fresh decompressor is created for every pass; whatever it
        # leaves in unused_data is fed to the next pass.
        decompressor = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        frames.append(decompressor.decompress(remaining))
        if not decompressor.eof:
            raise ZstdError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        remaining = decompressor.unused_data
        if not remaining:
            return b"".join(frames)


class CompressionParameter(enum.IntEnum):
    """Compression parameters."""

    # Every member's value is the corresponding _ZSTD_c_* attribute of the
    # _zstd module; the snake_case member name mirrors that attribute's name.

    # General compression parameters.
    compression_level = _zstd._ZSTD_c_compressionLevel
    window_log = _zstd._ZSTD_c_windowLog
    hash_log = _zstd._ZSTD_c_hashLog
    chain_log = _zstd._ZSTD_c_chainLog
    search_log = _zstd._ZSTD_c_searchLog
    min_match = _zstd._ZSTD_c_minMatch
    target_length = _zstd._ZSTD_c_targetLength
    strategy = _zstd._ZSTD_c_strategy

    # Long-distance matching (ldm_*) parameters.
    enable_long_distance_matching = _zstd._ZSTD_c_enableLongDistanceMatching
    ldm_hash_log = _zstd._ZSTD_c_ldmHashLog
    ldm_min_match = _zstd._ZSTD_c_ldmMinMatch
    ldm_bucket_size_log = _zstd._ZSTD_c_ldmBucketSizeLog
    ldm_hash_rate_log = _zstd._ZSTD_c_ldmHashRateLog

    # Flags (content size, checksum, dictionary ID).
    content_size_flag = _zstd._ZSTD_c_contentSizeFlag
    checksum_flag = _zstd._ZSTD_c_checksumFlag
    dict_id_flag = _zstd._ZSTD_c_dictIDFlag

    # NOTE(review): these look like libzstd's multi-threading parameters --
    # confirm against the zstd manual.
    nb_workers = _zstd._ZSTD_c_nbWorkers
    job_size = _zstd._ZSTD_c_jobSize
    overlap_log = _zstd._ZSTD_c_overlapLog

    def bounds(self):
        """Return the (lower, upper) int bounds of a compression parameter.

        Both the lower and upper bounds are inclusive.
        """
        # is_compress=True tells _zstd to look this value up as a compression
        # parameter (DecompressionParameter.bounds passes False).
        return _zstd._get_param_bounds(self.value, is_compress=True)


class DecompressionParameter(enum.IntEnum):
    """Decompression parameters."""

    # The member's value is the _ZSTD_d_windowLogMax attribute of the _zstd
    # module.
    window_log_max = _zstd._ZSTD_d_windowLogMax

    def bounds(self):
        """Return the (lower, upper) int bounds of a decompression parameter.

        Both the lower and upper bounds are inclusive.
        """
        # is_compress=False tells _zstd to look this value up as a
        # decompression parameter (CompressionParameter.bounds passes True).
        return _zstd._get_param_bounds(self.value, is_compress=False)


class Strategy(enum.IntEnum):
    """Compression strategies, listed from fastest to strongest.

    Note that new strategies might be added in the future.
    Only the order (from fast to strong) is guaranteed,
    the numeric value might change.
    """

    # Each member's value is the corresponding _ZSTD_* attribute of the _zstd
    # module.  Keep the declaration order: the docstring promises members are
    # listed from fastest to strongest.
    fast = _zstd._ZSTD_fast
    dfast = _zstd._ZSTD_dfast
    greedy = _zstd._ZSTD_greedy
    lazy = _zstd._ZSTD_lazy
    lazy2 = _zstd._ZSTD_lazy2
    btlazy2 = _zstd._ZSTD_btlazy2
    btopt = _zstd._ZSTD_btopt
    btultra = _zstd._ZSTD_btultra
    btultra2 = _zstd._ZSTD_btultra2


# Check validity of the CompressionParameter & DecompressionParameter types.
# This runs at import time and hands both enum classes to the _zstd module.
# NOTE(review): presumably _zstd keeps them so it can recognise enum members
# used as option keys -- confirm against the _zstd implementation.
_zstd._set_parameter_types(CompressionParameter, DecompressionParameter)
Loading
Loading