Skip to content

Commit

Permalink
expose Hasher::update_mmap and Hasher::update_mmap_rayon as blake3.update_mmap
Browse files Browse the repository at this point in the history
  • Loading branch information
oconnor663 committed Feb 1, 2024
1 parent 9975120 commit 51a03f4
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ crate-type = ["cdylib"]
neon = ["blake3/neon"]

[dependencies]
blake3 = { version = "1.0.0", features = ["rayon"] }
blake3 = { version = "1.5", features = ["mmap", "rayon"] }
hex = "0.4.2"
pyo3 = { version = "0.20.0", features = ["extension-module"] }
rayon = "1.2.1"
3 changes: 3 additions & 0 deletions blake3.pyi
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from os import PathLike

__version__: str = ...

class blake3:
Expand All @@ -19,6 +21,7 @@ class blake3:
): ...
# TODO: use collections.abc.Buffer here when PEP 688 lands in Python 3.12
def update(self, data: bytes, /) -> None: ...
def update_mmap(self, path: str | PathLike[str]) -> None: ...
def copy(self) -> blake3: ...
def reset(self) -> None: ...
def digest(self, length: int = ..., *, seek: int = ...) -> bytes: ...
Expand Down
74 changes: 70 additions & 4 deletions c_impl/blake3module.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
#include <Python.h>

#include <stdbool.h>
#include <stdio.h>

#include "blake3.h"

#define AUTO -1

#define BUFSIZE 65536

// CPython defines HASHLIB_GIL_MINSIZE in hashlib.h. We'll want to remove this
// definition if this code is added to CPython.
#ifdef HASHLIB_GIL_MINSIZE
Expand Down Expand Up @@ -220,6 +223,67 @@ static PyObject *Blake3_update(Blake3Object *self, PyObject *args) {
return ret;
}

// Hash the entire contents of the file at `path` into this hasher.
//
// Despite the name, this implementation doesn't actually use mmap; it just
// falls back to regular buffered file reading. This mainly exists for
// compatibility with the Rust implementation's Python test suite.
// TODO: actually mmap
//
// Python signature: update_mmap(path). `path` is converted with
// PyUnicode_FSConverter. Returns None on success. If opening, reading, or
// closing the file fails, raises OSError built from errno and returns NULL.
// Bytes read before a failure have already been fed to the hasher.
static PyObject *Blake3_update_mmap(Blake3Object *self, PyObject *args,
                                    PyObject *kwds) {
  PyBytesObject *path_bytes = NULL;
  FILE *file = NULL;
  PyObject *ret = NULL;
  // Read buffer: BUFSIZE bytes (not BUFSIZE pointers).
  char buf[BUFSIZE];
  // I/O failures are recorded here while the GIL is released and turned into
  // a Python exception only after the GIL has been reacquired.
  bool failed = false;
  int saved_errno = 0;

  static char *kwlist[] = {
      "path",
      NULL,
  };
  if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&", kwlist,
                                   PyUnicode_FSConverter, &path_bytes)) {
    return NULL;
  }

  // Fetch the raw path pointer up front. We hold our own reference to
  // path_bytes until the end of this function, so the pointer stays valid.
  const char *path = PyBytes_AS_STRING(path_bytes);

  PyThreadState *thread_state;
  Blake3_release_gil_and_lock_self(self, &thread_state);

  // NOTE: as the helper names indicate, the GIL is not held from here until
  // Blake3_unlock_self_and_acquire_gil() below, so this region must not call
  // into the Python C API (no PyErr_*, no refcount changes).

  // "rb": open in binary mode so that platforms that distinguish text from
  // binary streams (Windows) don't translate or truncate the bytes we hash.
  file = fopen(path, "rb");
  if (!file) {
    saved_errno = errno;
    failed = true;
  } else {
    for (;;) {
      size_t n = fread(buf, sizeof(char), sizeof(buf), file);
      if (ferror(file)) {
        saved_errno = errno;
        failed = true;
        break;
      }
      blake3_hasher_update(&self->hasher, buf, n);
      if (feof(file)) {
        break;
      }
    }

    // Always close the file. Report a close failure only if nothing failed
    // earlier, so that the first error is the one the caller sees.
    if (fclose(file) != 0 && !failed) {
      saved_errno = errno;
      failed = true;
    }
    file = NULL;
  }

  Blake3_unlock_self_and_acquire_gil(self, &thread_state);

  // The GIL is held again; it is now safe to touch Python objects.
  if (failed) {
    // Restore the errno captured at the point of failure, in case anything
    // in between overwrote it.
    errno = saved_errno;
    PyErr_SetFromErrno(PyExc_OSError);
  } else {
    Py_INCREF(Py_None);
    ret = Py_None;
  }

  Py_XDECREF(path_bytes);
  return ret;
}

static PyObject *Blake3_digest(Blake3Object *self, PyObject *args,
PyObject *kwds) {
static char *kwlist[] = {
Expand Down Expand Up @@ -279,11 +343,13 @@ static PyObject *Blake3_reset(Blake3Object *self, PyObject *args) {

static PyMethodDef Blake3_methods[] = {
{"update", (PyCFunction)Blake3_update, METH_VARARGS, "add input bytes"},
{"digest", (PyCFunction)Blake3_digest, METH_VARARGS | METH_KEYWORDS,
"finalize the hash"},
{"hexdigest", (PyCFunction)Blake3_hexdigest, METH_VARARGS | METH_KEYWORDS,
{"update_mmap", (PyCFunctionWithKeywords)Blake3_update_mmap,
METH_VARARGS | METH_KEYWORDS, "add input bytes from a filepath"},
{"digest", (PyCFunctionWithKeywords)Blake3_digest,
METH_VARARGS | METH_KEYWORDS, "finalize the hash"},
{"hexdigest", (PyCFunctionWithKeywords)Blake3_hexdigest,
METH_VARARGS | METH_KEYWORDS,
"finalize the hash and encode the result as hex"},
{"update", (PyCFunction)Blake3_update, METH_VARARGS, "add input bytes"},
{"copy", (PyCFunction)Blake3_copy, METH_VARARGS,
"make a copy of this hasher"},
{"reset", (PyCFunction)Blake3_reset, METH_VARARGS,
Expand Down
28 changes: 28 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use pyo3::buffer::PyBuffer;
use pyo3::exceptions::{PyBufferError, PyOverflowError, PyValueError};
use pyo3::prelude::*;
use pyo3::types::{PyAny, PyBytes, PyString};
use std::path::PathBuf;
use std::sync::Mutex;

// This is the same as HASHLIB_GIL_MINSIZE in CPython.
Expand Down Expand Up @@ -327,6 +328,33 @@ impl Blake3Class {
Ok(())
}

/// Memory-map the file at `path` and feed its contents into the hasher.
/// This may be called any number of times.
///
/// Which underlying hasher method runs depends on this object's threading
/// mode: `update_mmap` for `Single`, and `update_mmap_rayon` for `Auto` and
/// `Pool` (the latter inside the configured pool via `install`).
///
/// Arguments:
/// - `path` (required): The filepath to read.
#[pyo3(signature=(path))]
fn update_mmap(&mut self, py: Python, path: PathBuf) -> PyResult<()> {
    py.allow_threads(|| -> PyResult<()> {
        // Each arm yields a plain io::Result<()>; the `?` below converts any
        // I/O error into a Python exception in one place.
        let io_outcome = match &mut self.threading_mode {
            ThreadingMode::Single => self
                .rust_hasher
                .lock()
                .unwrap()
                .update_mmap(&path)
                .map(|_| ()),
            ThreadingMode::Auto => self
                .rust_hasher
                .lock()
                .unwrap()
                .update_mmap_rayon(&path)
                .map(|_| ()),
            ThreadingMode::Pool { pool, .. } => pool.install(|| {
                self.rust_hasher
                    .lock()
                    .unwrap()
                    .update_mmap_rayon(&path)
                    .map(|_| ())
            }),
        };
        io_outcome?;
        Ok(())
    })
}

/// Return a copy (“clone”) of the hasher. This can be used to
/// efficiently compute the digests of data sharing a common initial
/// substring.
Expand Down
24 changes: 24 additions & 0 deletions tests/test_blake3.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
import subprocess
import sys
import tempfile
from typing import Any

from blake3 import blake3, __version__
Expand Down Expand Up @@ -428,3 +429,26 @@ def test_module_name() -> None:
global_scope: dict[str, Any] = {}
exec(f"from {blake3.__module__} import blake3 as foobar", global_scope)
assert global_scope["foobar"] is blake3


def test_mmap() -> None:
    """update_mmap() on a file must hash the same bytes as passing them directly."""
    payload = bytes([42]) * 1_000_000
    digest_once = blake3(payload).digest()
    digest_twice = blake3(2 * payload).digest()

    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(payload)
        tmp.flush()
        str_path = tmp.name
        pathlib_path = Path(tmp.name)

        # Exercise all three threading modes, with both str and Path
        # arguments. Python bytes paths are deliberately not tested: PyO3
        # doesn't support converting bytes to a Rust PathBuf, probably
        # because that isn't generally possible on Windows.
        default_hasher = blake3()
        default_hasher.update_mmap(str_path)
        assert digest_once == default_hasher.digest()

        auto_hasher = blake3(max_threads=blake3.AUTO)
        auto_hasher.update_mmap(pathlib_path)
        assert digest_once == auto_hasher.digest()

        # Two updates through the keyword form must hash the file twice over.
        pool_hasher = blake3(max_threads=4)
        pool_hasher.update_mmap(path=str_path)
        pool_hasher.update_mmap(path=pathlib_path)
        assert digest_twice == pool_hasher.digest()

0 comments on commit 51a03f4

Please sign in to comment.