mongodb · caseyclements · Oct 15, 2025 · Oct 15, 2025 · Oct 15, 2025 · Oct 15, 2025
@@ -66,6 +66,16 @@
     from mmap import mmap as _mmap
 
 
+_NUMPY_AVAILABLE = False
+try:
+    import numpy as np
+    import numpy.typing as npt
+
+    _NUMPY_AVAILABLE = True
+except ImportError:
+    np = None  # type: ignore
+
+
 class UuidRepresentation:
     UNSPECIFIED = 0
     """An unspecified UUID representation.
@@ -234,13 +244,20 @@ class BinaryVector:
 
     __slots__ = ("data", "dtype", "padding")
 
-    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
+    def __init__(
+        self,
+        data: Union[Sequence[float | int], npt.NDArray[np.number]],
+        dtype: BinaryVectorDtype,
+        padding: int = 0,
+    ):
         """
         :param data: Sequence of numbers representing the mathematical vector.
         :param dtype:  The data type stored in binary
         :param padding: The number of bits in the final byte that are to be ignored
           when a vector element's size is less than a byte
           and the length of the vector is not a multiple of 8.
+          (Padding is equivalent to a negative value of `count` in
+          `numpy.unpackbits <https://numpy.org/doc/stable/reference/generated/numpy.unpackbits.html>`_)
         """
         self.data = data
         self.dtype = dtype
@@ -424,10 +441,20 @@ def from_vector(
     ) -> Binary:
         ...
 
+    # Typing-only overload: lets type checkers accept a NumPy array as ``vector``
+    # when an explicit ``dtype`` is supplied. The body is a stub; the runtime
+    # behavior lives in the undecorated ``from_vector`` defined after the overloads.
+    @classmethod
+    @overload
+    def from_vector(
+        cls: Type[Binary],
+        vector: npt.NDArray[np.number],
+        dtype: BinaryVectorDtype,
+        padding: int = 0,
+    ) -> Binary:
+        ...
+
     @classmethod
     def from_vector(
         cls: Type[Binary],
-        vector: Union[BinaryVector, list[int], list[float]],
+        vector: Union[BinaryVector, list[int], list[float], npt.NDArray[np.number]],
         dtype: Optional[BinaryVectorDtype] = None,
         padding: Optional[int] = None,
     ) -> Binary:
@@ -459,25 +486,30 @@ def from_vector(
             vector = vector.data  # type: ignore
 
         padding = 0 if padding is None else padding
-        if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
-            format_str = "b"
-            if padding:
-                raise ValueError(f"padding does not apply to {dtype=}")
-        elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
-            format_str = "B"
-            if 0 <= padding > 7:
-                raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
-            if padding and not vector:
-                raise ValueError("Empty vector with non-zero padding.")
-        elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
-            format_str = "f"
-            if padding:
-                raise ValueError(f"padding does not apply to {dtype=}")
+        assert isinstance(dtype, BinaryVectorDtype)
+        metadata = struct.pack("<sB", dtype.value, padding)
+
+        if _NUMPY_AVAILABLE and isinstance(vector, np.ndarray):
+            data = _numpy_vector_to_bytes(vector, dtype)
         else:
-            raise NotImplementedError("%s not yet supported" % dtype)
+            if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
+                format_str = "b"
+                if padding:
+                    raise ValueError(f"padding does not apply to {dtype=}")
+            elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
+                format_str = "B"
+                if 0 <= padding > 7:
+                    raise ValueError(f"{padding=}. It must be in [0,1, ..7].")
+                if padding and not vector:
+                    raise ValueError("Empty vector with non-zero padding.")
+            elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
+                format_str = "f"
+                if padding:
+                    raise ValueError(f"padding does not apply to {dtype=}")
+            else:
+                raise NotImplementedError("%s not yet supported" % dtype)
+            data = struct.pack(f"<{len(vector)}{format_str}", *vector)  # type: ignore
 
-        metadata = struct.pack("<sB", dtype.value, padding)
-        data = struct.pack(f"<{len(vector)}{format_str}", *vector)  # type: ignore
         if padding and len(vector) and not (data[-1] & ((1 << padding) - 1)) == 0:
             raise ValueError(
                 "Vector has a padding P, but bits in the final byte lower than P are non-zero. They must be zero."
@@ -549,6 +581,33 @@ def subtype(self) -> int:
         """Subtype of this binary data."""
         return self.__subtype
 
+    def as_numpy_vector(self) -> BinaryVector:
+        """From the Binary, create a BinaryVector where data is a 1-dim numpy array.
+
+        The first two bytes of the payload are read as the dtype code and the
+        padding; the remaining bytes are interpreted with ``numpy.frombuffer`` as
+        ``int8`` (INT8), little-endian ``float32`` (FLOAT32) or ``uint8``
+        (PACKED_BIT). PACKED_BIT data is returned as the packed bytes, not as
+        individual bits.
+
+        dtype still follows our typing (BinaryVectorDtype),
+        and padding is as we define it, notably equivalent to a negative value of count
+        in `numpy.unpackbits <https://numpy.org/doc/stable/reference/generated/numpy.unpackbits.html>`_.
+
+        :return: BinaryVector
+        :raises ValueError: If this Binary's subtype is not the vector subtype,
+          or the dtype code is not one of the supported BinaryVectorDtype values.
+        :raises ImportError: If numpy is not installed.
+
+        .. versionadded:: 4.16
+        """
+        if self.subtype != VECTOR_SUBTYPE:
+            raise ValueError(f"Cannot decode subtype {self.subtype} as a vector")
+        if not _NUMPY_AVAILABLE:
+            raise ImportError("Converting binary to numpy.ndarray requires numpy to be installed.")
+        # Header layout: one byte of dtype code followed by one byte of padding.
+        dtype, padding = struct.unpack_from("<sB", self, 0)
+        dtype = BinaryVectorDtype(dtype)
+
+        # The payload is written little-endian (see the "<" struct formats used when
+        # encoding), so the float dtype pins its byte order instead of relying on the
+        # host's native order. "<f4" is the same as "float32" on little-endian hosts.
+        if dtype == BinaryVectorDtype.INT8:
+            np_dtype = "int8"
+        elif dtype == BinaryVectorDtype.FLOAT32:
+            np_dtype = "<f4"
+        elif dtype == BinaryVectorDtype.PACKED_BIT:
+            np_dtype = "uint8"
+        else:
+            raise ValueError(f"Unsupported dtype code: {dtype!r}")
+        data = np.frombuffer(self[2:], dtype=np_dtype)
+        return BinaryVector(data, dtype, padding)
+
     def __getnewargs__(self) -> Tuple[bytes, int]:  # type: ignore[override]
         # Work around http://bugs.python.org/issue7382
         data = super().__getnewargs__()[0]
@@ -575,3 +634,32 @@ def __repr__(self) -> str:
             return f"<Binary(REDACTED, {self.__subtype})>"
         else:
             return f"Binary({bytes.__repr__(self)}, {self.__subtype})"
+
+
+def _numpy_vector_to_bytes(
+    vector: npt.NDArray[np.number],
+    dtype: BinaryVectorDtype,
+) -> bytes:
+    """Serialize a 1-D numpy array into the element bytes of a binary vector.
+
+    Only the element payload is produced here; the dtype/padding header is
+    packed separately by the caller.
+
+    :param vector: 1-D numpy array of numbers.
+    :param dtype: Target element type. FLOAT32 values are cast to little-endian
+      float32; INT8 and PACKED_BIT values are range-checked and then cast to
+      int8 / uint8 respectively.
+    :return: The array contents as bytes. An empty array yields ``b""``.
+    :raises ImportError: If numpy is not installed.
+    :raises ValueError: If any value lies outside the range accepted for ``dtype``.
+    :raises NotImplementedError: If ``dtype`` is not one of the supported types.
+    """
+    if not _NUMPY_AVAILABLE:
+        raise ImportError("Converting numpy.ndarray to binary requires numpy to be installed.")
+
+    assert isinstance(vector, np.ndarray)
+    assert (
+        vector.ndim == 1
+    ), "from_numpy_vector only supports 1D arrays as it creates a single vector."
+
+    # ndarray.min()/max() raise on zero-size arrays, so the range checks are only
+    # meaningful (and only run) when there is at least one element.
+    has_values = vector.size > 0
+
+    if dtype == BinaryVectorDtype.FLOAT32:
+        # Pin little-endian so the bytes do not depend on the host's native order.
+        vector = vector.astype(np.dtype("<f4"), copy=False)
+    elif dtype == BinaryVectorDtype.INT8:
+        if has_values and not (vector.min() >= -128 and vector.max() <= 127):
+            raise ValueError("Values found outside INT8 range.")
+        vector = vector.astype(np.dtype("int8"), copy=False)
+    elif dtype == BinaryVectorDtype.PACKED_BIT:
+        # NOTE(review): the upper bound here is 127 although the target type is
+        # uint8 (0..255) and the error message refers to the UINT8 range. Packed
+        # bytes with the high bit set are therefore rejected for ndarray input.
+        # Confirm whether 255 was intended; existing tests expect 128 to raise.
+        if has_values and not (vector.min() >= 0 and vector.max() <= 127):
+            raise ValueError("Values found outside UINT8 range.")
+        vector = vector.astype(np.dtype("uint8"), copy=False)
+    else:
+        raise NotImplementedError("%s not yet supported" % dtype)
+    return vector.tobytes()
@@ -16,6 +16,7 @@ PyMongo 4.16 brings a number of changes including:
   Python 3.10+.  The minimum version is ``2.6.1`` to account for `CVE-2023-29483 <https://www.cve.org/CVERecord?id=CVE-2023-29483>`_.
 - Removed support for Eventlet.
   Eventlet is actively being sunset by its maintainers and has compatibility issues with PyMongo's dnspython dependency.
+- Added support for NumPy 1D-arrays in BSON Binary Vectors.
 
 Changes in Version 4.15.3 (2025/10/07)
 --------------------------------------

@@ -2,7 +2,7 @@
 set shell := ["bash", "-c"]
 
 # Commonly used command segments.
-typing_run := "uv run --group typing --extra aws --extra encryption --extra ocsp --extra snappy --extra test --extra zstd"
+typing_run := "uv run --group typing --extra aws --extra encryption --extra numpy --extra ocsp --extra snappy --extra test --extra zstd"
 docs_run := "uv run --extra docs"
 doc_build := "./doc/_build"
 mypy_args := "--install-types --non-interactive"

diff --git a/pyproject.toml b/pyproject.toml
@@ -87,6 +87,7 @@ ocsp = ["requirements/ocsp.txt"]
 snappy = ["requirements/snappy.txt"]
 test = ["requirements/test.txt"]
 zstd = ["requirements/zstd.txt"]
+numpy = ["requirements/numpy.txt"]
 
 [tool.pytest.ini_options]
 minversion = "7"

diff --git a/requirements/numpy.txt b/requirements/numpy.txt
@@ -0,0 +1 @@
+numpy>=1.21
@@ -71,6 +71,14 @@
 from bson.timestamp import Timestamp
 from bson.tz_util import FixedOffset, utc
 
+_NUMPY_AVAILABLE = False
+try:
+    import numpy as np
+
+    _NUMPY_AVAILABLE = True
+except ImportError:
+    np = None  # type: ignore
+
 
 class NotADict(abc.MutableMapping):
     """Non-dict type that implements the mapping protocol."""
@@ -871,6 +879,60 @@ def test_binaryvector_equality(self):
             BinaryVector([1], BinaryVectorDtype.INT8), BinaryVector([2], BinaryVectorDtype.INT8)
         )
 
+    @unittest.skipIf(not _NUMPY_AVAILABLE, "numpy optional-dependency not installed.")
+    def test_vector_from_numpy(self):
+        """Follows test_vector except for input type numpy.ndarray"""
+        # Simple data values could be treated as any of our BinaryVectorDtypes
+        arr = np.array([2, 3])
+        for dtype in (
+            BinaryVectorDtype.INT8,
+            BinaryVectorDtype.PACKED_BIT,
+            BinaryVectorDtype.FLOAT32,
+        ):
+            with self.subTest(dtype=dtype):
+                binary_vector = Binary.from_vector(arr, dtype)
+                # as_vector: data comes back as a plain list
+                vector = binary_vector.as_vector()
+                assert isinstance(vector, BinaryVector)
+                assert vector.data == arr.tolist()
+                # as_numpy_vector: data comes back as a numpy array.
+                # Compare the numpy-decoded data (not the list-decoded one) to the input.
+                vector_np = binary_vector.as_numpy_vector()
+                assert isinstance(vector_np, BinaryVector)
+                assert isinstance(vector_np.data, np.ndarray)
+                assert np.all(vector_np.data == arr)
+
+        # Invalid cases
+        with self.assertRaises(ValueError):
+            Binary.from_vector(np.array([-1]), BinaryVectorDtype.PACKED_BIT)
+        with self.assertRaises(ValueError):
+            Binary.from_vector(np.array([128]), BinaryVectorDtype.PACKED_BIT)
+        with self.assertRaises(ValueError):
+            Binary.from_vector(np.array([-198]), BinaryVectorDtype.INT8)
+
+        # Unexpected cases
+        # Creating a vector of INT8 from a list of doubles will be caught by struct.pack
+        # Numpy's default behavior is to cast to the type requested.
+        list_floats = [-1.1, 1.1]
+        cast_bin = Binary.from_vector(np.array(list_floats), BinaryVectorDtype.INT8)
+        vector = cast_bin.as_vector()
+        vector_np = cast_bin.as_numpy_vector()
+        assert vector.data != list_floats
+        assert vector.data == vector_np.data.tolist() == [-1, 1]
+
+
     def test_unicode_regex(self):
         """Tests we do not get a segfault for C extension on unicode RegExs.
         This had been happening.