Skip to content

BUG: Fix pandas compatibility with Python installations lacking bzip2 headers #53858

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jul 7, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ I/O
- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
- Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`)
- Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
- Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`)

Period
^^^^^^
Expand Down
8 changes: 5 additions & 3 deletions pandas/_testing/_io.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import bz2
import gzip
import io
import tarfile
Expand All @@ -11,7 +10,10 @@
)
import zipfile

from pandas.compat import get_lzma_file
from pandas.compat import (
get_bz2_file,
get_lzma_file,
)
from pandas.compat._optional import import_optional_dependency

import pandas as pd
Expand Down Expand Up @@ -157,7 +159,7 @@ def write_to_compressed(compression, path, data, dest: str = "test"):
elif compression == "gzip":
compress_method = gzip.GzipFile
elif compression == "bz2":
compress_method = bz2.BZ2File
compress_method = get_bz2_file()
elif compression == "zstd":
compress_method = import_optional_dependency("zstandard").open
elif compression == "xz":
Expand Down
23 changes: 23 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,29 @@ def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]:
return pandas.compat.compressors.LZMAFile


def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
"""
Importing the `BZ2File` class from the `bz2` module.

Returns
-------
class
The `BZ2File` class from the `bz2` module.

Raises
------
RuntimeError
If the `bz2` module was not imported correctly, or didn't exist.
"""
if not pandas.compat.compressors.has_bz2:
raise RuntimeError(
"bz2 module not available. "
"A Python re-install with the proper dependencies, "
"might be required to solve this issue."
)
return pandas.compat.compressors.BZ2File


__all__ = [
"is_numpy_dev",
"pa_version_under7p0",
Expand Down
30 changes: 19 additions & 11 deletions pandas/compat/compressors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,17 @@

from __future__ import annotations

import bz2
from pickle import PickleBuffer

from pandas.compat._constants import PY310

# Probe for the optional ``bz2`` module and record the outcome in ``has_bz2``
# so that the rest of the module can guard its bz2-dependent definitions.
try:
    import bz2
except ImportError:
    has_bz2 = False
else:
    has_bz2 = True

try:
import lzma

Expand Down Expand Up @@ -41,17 +47,19 @@ def flatten_buffer(
return memoryview(b).tobytes("A")


class BZ2File(bz2.BZ2File):
if not PY310:
if has_bz2:

def write(self, b) -> int:
# Workaround issue where `bz2.BZ2File` expects `len`
# to return the number of bytes in `b` by converting
# `b` into something that meets that constraint with
# minimal copying.
#
# Note: This is fixed in Python 3.10.
return super().write(flatten_buffer(b))
class BZ2File(bz2.BZ2File):
    """``bz2.BZ2File`` subclass that overrides ``write`` when ``PY310`` is false."""

    if not PY310:

        def write(self, b) -> int:
            # `bz2.BZ2File` assumes `len(b)` is the byte count of `b`.
            # Route `b` through `flatten_buffer` first so that assumption
            # holds while copying as little as possible.
            #
            # Python 3.10 fixes this upstream, hence the `PY310` guard.
            flattened = flatten_buffer(b)
            return super().write(flattened)


if has_lzma:
Expand Down
8 changes: 5 additions & 3 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,11 @@
StorageOptions,
WriteBuffer,
)
from pandas.compat import get_lzma_file
from pandas.compat import (
get_bz2_file,
get_lzma_file,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.compressors import BZ2File as _BZ2File
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level

Expand Down Expand Up @@ -766,7 +768,7 @@ def get_handle(
elif compression == "bz2":
# Overload of "BZ2File" to handle pickle protocol 5
# "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
handle = _BZ2File( # type: ignore[call-overload]
handle = get_bz2_file()( # type: ignore[call-overload]
handle,
mode=ioargs.mode,
**compression_args,
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import string
import subprocess
import sys
import textwrap

import numpy as np
import pytest
Expand Down Expand Up @@ -245,3 +246,21 @@ def test_str_size():
]
result = subprocess.check_output(call).decode()[-4:-1].strip("\n")
assert int(result) == int(expected)


@pytest.mark.single_cpu
def test_bz2_missing_import():
    # Check whether bz2 missing import is handled correctly (issue #53857)
    #
    # The check runs in a child interpreter: `sys.modules['bz2'] = None` is
    # set before pandas is imported there, so `import bz2` fails in that
    # process only and the current test session is left untouched.
    code = """
        import sys
        sys.modules['bz2'] = None
        import pytest
        import pandas as pd
        from pandas.compat import get_bz2_file
        msg = 'bz2 module not available.'
        with pytest.raises(RuntimeError, match=msg):
            get_bz2_file()
    """
    # `dedent` strips only the common leading whitespace, so the body of the
    # `with` statement must stay indented relative to it inside the literal.
    code = textwrap.dedent(code)
    call = [sys.executable, "-c", code]
    # A non-zero exit status of the child raises CalledProcessError here.
    subprocess.check_output(call)