Skip to content
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 107 additions & 19 deletions bson/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,16 @@
from mmap import mmap as _mmap


_NUMPY_AVAILABLE = False
try:
import numpy as np
import numpy.typing as npt

_NUMPY_AVAILABLE = True
except ImportError:
np = None # type: ignore
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC importing numpy is very very slow. In that case we should not even attempt to import it by default.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed to lazy imports, and used importlib.util.find_spec("numpy") as skip condition in test_bson.py.



class UuidRepresentation:
UNSPECIFIED = 0
"""An unspecified UUID representation.
Expand Down Expand Up @@ -234,13 +244,20 @@ class BinaryVector:

__slots__ = ("data", "dtype", "padding")

def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
def __init__(
self,
data: Union[Sequence[float | int], npt.NDArray[np.number]],
dtype: BinaryVectorDtype,
padding: int = 0,
):
"""
:param data: Sequence of numbers representing the mathematical vector.
:param dtype: The data type stored in binary
:param padding: The number of bits in the final byte that are to be ignored
when a vector element's size is less than a byte
and the length of the vector is not a multiple of 8.
(Padding is equivalent to a negative value of `count` in
`numpy.unpackbits <https://numpy.org/doc/stable/reference/generated/numpy.unpackbits.html>`_)
"""
self.data = data
self.dtype = dtype
Expand Down Expand Up @@ -424,10 +441,20 @@ def from_vector(
) -> Binary:
...

@classmethod
@overload
def from_vector(
cls: Type[Binary],
vector: npt.NDArray[np.number],
dtype: BinaryVectorDtype,
padding: int = 0,
) -> Binary:
...

@classmethod
def from_vector(
cls: Type[Binary],
vector: Union[BinaryVector, list[int], list[float]],
vector: Union[BinaryVector, list[int], list[float], npt.NDArray[np.number]],
dtype: Optional[BinaryVectorDtype] = None,
padding: Optional[int] = None,
) -> Binary:
Expand Down Expand Up @@ -459,25 +486,30 @@ def from_vector(
vector = vector.data # type: ignore

padding = 0 if padding is None else padding
if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8
format_str = "b"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8
format_str = "B"
if 0 <= padding > 7:
raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
if padding and not vector:
raise ValueError("Empty vector with non-zero padding.")
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
format_str = "f"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
assert isinstance(dtype, BinaryVectorDtype)
metadata = struct.pack("<sB", dtype.value, padding)

if _NUMPY_AVAILABLE and isinstance(vector, np.ndarray):
data = _numpy_vector_to_bytes(vector, dtype)
else:
raise NotImplementedError("%s not yet supported" % dtype)
if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8
format_str = "b"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8
format_str = "B"
if 0 <= padding > 7:
raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
if padding and not vector:
raise ValueError("Empty vector with non-zero padding.")
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
format_str = "f"
if padding:
raise ValueError(f"padding does not apply to {dtype=}")
else:
raise NotImplementedError("%s not yet supported" % dtype)
data = struct.pack(f"<{len(vector)}{format_str}", *vector) # type: ignore

metadata = struct.pack("<sB", dtype.value, padding)
data = struct.pack(f"<{len(vector)}{format_str}", *vector) # type: ignore
if padding and len(vector) and not (data[-1] & ((1 << padding) - 1)) == 0:
raise ValueError(
"Vector has a padding P, but bits in the final byte lower than P are non-zero. They must be zero."
Expand Down Expand Up @@ -549,6 +581,33 @@ def subtype(self) -> int:
"""Subtype of this binary data."""
return self.__subtype

def as_numpy_vector(self) -> BinaryVector:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be a new method or a new argument to the existing as_vector method? Like binary.as_vector(numpy=True)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm open to this as alternative to additional function as_numpy_vector. What do you think, @blink1073 ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like Shane's suggestion, for symmetry with from_vector.

"""From the Binary, create a BinaryVector where data is a 1-dim numpy array.
dtype still follows our typing (BinaryVectorDtype),
and padding is as we define it, notably equivalent to a negative value of count
in `numpy.unpackbits <https://numpy.org/doc/stable/reference/generated/numpy.unpackbits.html>`_.

:return: BinaryVector

.. versionadded:: 4.16
"""
if self.subtype != VECTOR_SUBTYPE:
raise ValueError(f"Cannot decode subtype {self.subtype} as a vector")
if not _NUMPY_AVAILABLE:
raise ImportError("Converting binary to numpy.ndarray requires numpy to be installed.")
dtype, padding = struct.unpack_from("<sB", self, 0)
dtype = BinaryVectorDtype(dtype)

if dtype == BinaryVectorDtype.INT8:
data = np.frombuffer(self[2:], dtype="int8")
elif dtype == BinaryVectorDtype.FLOAT32:
data = np.frombuffer(self[2:], dtype="float32")
elif dtype == BinaryVectorDtype.PACKED_BIT:
data = np.frombuffer(self[2:], dtype="uint8")
Copy link
Member

@ShaneHarvey ShaneHarvey Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we following the rules of the spec for validating PACKED_BIT here (eg the validation in as_vector)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just added the same validations applied in as_vector.

else:
raise ValueError(f"Unsupported dtype code: {dtype!r}")
return BinaryVector(data, dtype, padding)

def __getnewargs__(self) -> Tuple[bytes, int]: # type: ignore[override]
# Work around http://bugs.python.org/issue7382
data = super().__getnewargs__()[0]
Expand All @@ -575,3 +634,32 @@ def __repr__(self) -> str:
return f"<Binary(REDACTED, {self.__subtype})>"
else:
return f"Binary({bytes.__repr__(self)}, {self.__subtype})"


def _numpy_vector_to_bytes(
    vector: npt.NDArray[np.number],
    dtype: BinaryVectorDtype,
) -> bytes:
    """Serialize a 1-D numpy array into the element bytes of a binary vector.

    Only the element payload is produced; the caller is responsible for the
    two-byte (dtype, padding) metadata header.

    :param vector: One-dimensional numpy array holding the vector elements.
    :param dtype: Target element type of the binary vector.
    :return: The elements cast to ``dtype`` and laid out contiguously.
        An empty array yields ``b""``.
    :raises ImportError: If numpy is not installed.
    :raises ValueError: If ``vector`` is not one-dimensional, or if any value
        falls outside the range accepted for INT8 / PACKED_BIT.
    :raises NotImplementedError: If ``dtype`` is not a supported vector dtype.
    """
    if not _NUMPY_AVAILABLE:
        raise ImportError("Converting numpy.ndarray to binary requires numpy to be installed.")

    assert isinstance(vector, np.ndarray)
    # Raise explicitly rather than assert: asserts vanish under ``python -O``,
    # and tobytes() would then silently flatten a multi-dimensional array.
    if vector.ndim != 1:
        raise ValueError(
            "from_numpy_vector only supports 1D arrays as it creates a single vector."
        )

    if dtype == BinaryVectorDtype.FLOAT32:
        # "<f4" pins little-endian float32 so the bytes match the "<"-prefixed
        # struct.pack used for plain sequences, whatever the host byte order.
        return vector.astype(np.dtype("<f4"), copy=False).tobytes()

    if dtype == BinaryVectorDtype.INT8:
        low, high, target, label = -128, 127, "int8", "INT8"
    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # NOTE(review): the upper bound is 127 although the error message says
        # UINT8 and the sequence path packs values in [0, 255] -- confirm
        # whether 255 is intended before relaxing this.
        low, high, target, label = 0, 127, "uint8", "UINT8"
    else:
        raise NotImplementedError("%s not yet supported" % dtype)

    # min()/max() raise on zero-size arrays, so only range-check when there is
    # data; an empty vector is valid and serializes to b"".
    if vector.size and not (vector.min() >= low and vector.max() <= high):
        raise ValueError(f"Values found outside {label} range.")
    # Single-byte elements have no byte order, so a plain cast is sufficient.
    return vector.astype(np.dtype(target), copy=False).tobytes()
1 change: 1 addition & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ PyMongo 4.16 brings a number of changes including:
Python 3.10+. The minimum version is ``2.6.1`` to account for `CVE-2023-29483 <https://www.cve.org/CVERecord?id=CVE-2023-29483>`_.
- Removed support for Eventlet.
Eventlet is actively being sunset by its maintainers and has compatibility issues with PyMongo's dnspython dependency.
- Added support for NumPy 1D-arrays in BSON Binary Vectors.

Changes in Version 4.15.3 (2025/10/07)
--------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set shell := ["bash", "-c"]

# Commonly used command segments.
typing_run := "uv run --group typing --extra aws --extra encryption --extra ocsp --extra snappy --extra test --extra zstd"
typing_run := "uv run --group typing --extra aws --extra encryption --extra numpy --extra ocsp --extra snappy --extra test --extra zstd"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of adding a new extra we could use --with numpy.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a great suggestion. I had to change the invocations from uv run mypy . (or pytest) so that they are run as modules, e.g. uv run python -m mypy ., but it works. I removed the extra from pyproject and requirements/.

docs_run := "uv run --extra docs"
doc_build := "./doc/_build"
mypy_args := "--install-types --non-interactive"
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ ocsp = ["requirements/ocsp.txt"]
snappy = ["requirements/snappy.txt"]
test = ["requirements/test.txt"]
zstd = ["requirements/zstd.txt"]
numpy = ["requirements/numpy.txt"]

[tool.pytest.ini_options]
minversion = "7"
Expand Down
1 change: 1 addition & 0 deletions requirements/numpy.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
numpy>=1.21
62 changes: 62 additions & 0 deletions test/test_bson.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@
from bson.timestamp import Timestamp
from bson.tz_util import FixedOffset, utc

_NUMPY_AVAILABLE = False
try:
import numpy as np

_NUMPY_AVAILABLE = True
except ImportError:
np = None # type: ignore


class NotADict(abc.MutableMapping):
"""Non-dict type that implements the mapping protocol."""
Expand Down Expand Up @@ -871,6 +879,60 @@ def test_binaryvector_equality(self):
BinaryVector([1], BinaryVectorDtype.INT8), BinaryVector([2], BinaryVectorDtype.INT8)
)

@unittest.skipIf(not _NUMPY_AVAILABLE, "numpy optional-dependency not installed.")
def test_vector_from_numpy(self):
    """Follows test_vector except for input type numpy.ndarray"""
    # Simple data values could be treated as any of our BinaryVectorDtypes
    arr = np.array([2, 3])
    for dtype in (
        BinaryVectorDtype.INT8,
        BinaryVectorDtype.PACKED_BIT,
        BinaryVectorDtype.FLOAT32,
    ):
        with self.subTest(dtype=dtype):
            binary = Binary.from_vector(arr, dtype)
            # as_vector: data comes back as a plain list
            vector = binary.as_vector()
            assert isinstance(vector, BinaryVector)
            assert vector.data == arr.tolist()
            # as_numpy_vector: check the numpy-backed result itself, not the
            # list-backed one obtained from as_vector above
            vector_np = binary.as_numpy_vector()
            assert isinstance(vector_np, BinaryVector)
            assert np.all(vector_np.data == arr)

    # Invalid cases
    with self.assertRaises(ValueError):
        Binary.from_vector(np.array([-1]), BinaryVectorDtype.PACKED_BIT)
    with self.assertRaises(ValueError):
        Binary.from_vector(np.array([128]), BinaryVectorDtype.PACKED_BIT)
    with self.assertRaises(ValueError):
        Binary.from_vector(np.array([-198]), BinaryVectorDtype.INT8)

    # Unexpected cases
    # Creating a vector of INT8 from a list of doubles will be caught by struct.pack
    # Numpy's default behavior is to cast to the type requested.
    list_floats = [-1.1, 1.1]
    cast_bin = Binary.from_vector(np.array(list_floats), BinaryVectorDtype.INT8)
    vector = cast_bin.as_vector()
    vector_np = cast_bin.as_numpy_vector()
    assert vector.data != list_floats
    assert vector.data == vector_np.data.tolist() == [-1, 1]

def test_unicode_regex(self):
"""Tests we do not get a segfault for C extension on unicode RegExs.
This had been happening.
Expand Down
Loading
Loading