Skip to content

Commit fa51d84

Browse files
committed
expose Hasher::update_mmap and Hasher::update_mmap_rayon as blake3.update_mmap
1 parent 9975120 commit fa51d84

File tree

5 files changed

+130
-5
lines changed

5 files changed

+130
-5
lines changed

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ crate-type = ["cdylib"]
2121
neon = ["blake3/neon"]
2222

2323
[dependencies]
24-
blake3 = { version = "1.0.0", features = ["rayon"] }
24+
blake3 = { version = "1.5", features = ["mmap", "rayon"] }
2525
hex = "0.4.2"
2626
pyo3 = { version = "0.20.0", features = ["extension-module"] }
2727
rayon = "1.2.1"

blake3.pyi

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from os import PathLike
2+
13
__version__: str = ...
24

35
class blake3:
@@ -19,6 +21,7 @@ class blake3:
1921
): ...
2022
# TODO: use collections.abc.Buffer here when PEP 688 lands in Python 3.12
2123
def update(self, data: bytes, /) -> None: ...
24+
# Add the contents of the file at `path` to the hasher. `path` may be a str
# or an os.PathLike[str].
def update_mmap(self, path: str | PathLike[str]) -> None: ...
2225
def copy(self) -> blake3: ...
2326
def reset(self) -> None: ...
2427
def digest(self, length: int = ..., *, seek: int = ...) -> bytes: ...

c_impl/blake3module.c

+70-4
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@
22
#include <Python.h>
33

44
#include <errno.h>
#include <stdbool.h>
5+
#include <stdio.h>
56

67
#include "blake3.h"
78

89
#define AUTO -1
910

11+
// Maximum number of bytes requested per fread() call when Blake3_update_mmap
// reads a file.
#define BUFSIZE 65536
12+
1013
// CPython defines HASHLIB_GIL_MINSIZE in hashlib.h. We'll want to remove this
1114
// definition if this code is added to CPython.
1215
#ifdef HASHLIB_GIL_MINSIZE
@@ -220,6 +223,67 @@ static PyObject *Blake3_update(Blake3Object *self, PyObject *args) {
220223
return ret;
221224
}
222225

226+
// This implementation doesn't actually use mmap; it just falls back to regular
227+
// file reading. This mainly exists for compatibility with the Rust
228+
// implementation's Python test suite.
229+
// TODO: actually mmap
230+
static PyObject *Blake3_update_mmap(Blake3Object *self, PyObject *args,
231+
PyObject *kwds) {
232+
PyBytesObject *path_bytes = NULL;
233+
FILE *file = NULL;
234+
PyObject *ret = NULL;
235+
236+
static char *kwlist[] = {
237+
"path",
238+
NULL,
239+
};
240+
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&", kwlist,
241+
PyUnicode_FSConverter, &path_bytes)) {
242+
return NULL;
243+
}
244+
245+
PyThreadState *thread_state;
246+
Blake3_release_gil_and_lock_self(self, &thread_state);
247+
248+
file = fopen(PyBytes_AS_STRING(path_bytes), "r");
249+
if (!file) {
250+
PyErr_SetFromErrno(PyExc_OSError);
251+
goto exit;
252+
}
253+
254+
char *buf[BUFSIZE];
255+
while (1) {
256+
size_t n = fread(buf, sizeof(char), BUFSIZE, file);
257+
if (ferror(file)) {
258+
PyErr_SetFromErrno(PyExc_OSError);
259+
goto exit;
260+
}
261+
blake3_hasher_update(&self->hasher, buf, n);
262+
if (feof(file)) {
263+
break;
264+
}
265+
}
266+
267+
int fclose_ret = fclose(file);
268+
file = NULL;
269+
if (fclose_ret != 0) {
270+
PyErr_SetFromErrno(PyExc_OSError);
271+
goto exit;
272+
}
273+
274+
// success
275+
Py_INCREF(Py_None);
276+
ret = Py_None;
277+
278+
exit:
279+
if (file) {
280+
fclose(file);
281+
}
282+
Blake3_unlock_self_and_acquire_gil(self, &thread_state);
283+
Py_XDECREF(path_bytes);
284+
return ret;
285+
}
286+
223287
static PyObject *Blake3_digest(Blake3Object *self, PyObject *args,
224288
PyObject *kwds) {
225289
static char *kwlist[] = {
@@ -279,11 +343,13 @@ static PyObject *Blake3_reset(Blake3Object *self, PyObject *args) {
279343

280344
static PyMethodDef Blake3_methods[] = {
281345
{"update", (PyCFunction)Blake3_update, METH_VARARGS, "add input bytes"},
282-
{"digest", (PyCFunction)Blake3_digest, METH_VARARGS | METH_KEYWORDS,
283-
"finalize the hash"},
284-
{"hexdigest", (PyCFunction)Blake3_hexdigest, METH_VARARGS | METH_KEYWORDS,
346+
{"update_mmap", (PyCFunctionWithKeywords)Blake3_update_mmap,
347+
METH_VARARGS | METH_KEYWORDS, "add input bytes from a filepath"},
348+
{"digest", (PyCFunctionWithKeywords)Blake3_digest,
349+
METH_VARARGS | METH_KEYWORDS, "finalize the hash"},
350+
{"hexdigest", (PyCFunctionWithKeywords)Blake3_hexdigest,
351+
METH_VARARGS | METH_KEYWORDS,
285352
"finalize the hash and encode the result as hex"},
286-
{"update", (PyCFunction)Blake3_update, METH_VARARGS, "add input bytes"},
287353
{"copy", (PyCFunction)Blake3_copy, METH_VARARGS,
288354
"make a copy of this hasher"},
289355
{"reset", (PyCFunction)Blake3_reset, METH_VARARGS,

src/lib.rs

+28
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use pyo3::buffer::PyBuffer;
44
use pyo3::exceptions::{PyBufferError, PyOverflowError, PyValueError};
55
use pyo3::prelude::*;
66
use pyo3::types::{PyAny, PyBytes, PyString};
7+
use std::path::PathBuf;
78
use std::sync::Mutex;
89

910
// This is the same as HASHLIB_GIL_MINSIZE in CPython.
@@ -327,6 +328,33 @@ impl Blake3Class {
327328
Ok(())
328329
}
329330

331+
/// Read a file using memory mapping and add its bytes to the hasher. You can call this any
332+
/// number of times.
333+
///
334+
/// Arguments:
335+
/// - `path` (required): The filepath to read.
336+
#[pyo3(signature=(path))]
337+
fn update_mmap(&mut self, py: Python, path: PathBuf) -> PyResult<()> {
338+
py.allow_threads(|| -> PyResult<()> {
339+
match &mut self.threading_mode {
340+
ThreadingMode::Single => {
341+
self.rust_hasher.lock().unwrap().update_mmap(&path)?;
342+
}
343+
ThreadingMode::Auto => {
344+
self.rust_hasher.lock().unwrap().update_mmap_rayon(&path)?;
345+
}
346+
ThreadingMode::Pool { pool, .. } => {
347+
pool.install(|| -> PyResult<()> {
348+
self.rust_hasher.lock().unwrap().update_mmap_rayon(&path)?;
349+
Ok(())
350+
})?;
351+
}
352+
}
353+
Ok(())
354+
})?;
355+
Ok(())
356+
}
357+
330358
/// Return a copy (“clone”) of the hasher. This can be used to
331359
/// efficiently compute the digests of data sharing a common initial
332360
/// substring.

tests/test_blake3.py

+28
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
from binascii import unhexlify
33
import json
44
import numpy
5+
import os
56
from pathlib import Path
67
import subprocess
78
import sys
9+
import tempfile
810
from typing import Any
911

1012
from blake3 import blake3, __version__
@@ -428,3 +430,29 @@ def test_module_name() -> None:
428430
global_scope: dict[str, Any] = {}
429431
exec(f"from {blake3.__module__} import blake3 as foobar", global_scope)
430432
assert global_scope["foobar"] is blake3
433+
434+
435+
def test_mmap() -> None:
    """Check that update_mmap() hashes a file's contents like update() would.

    Writes 1,000,000 bytes to a temporary file, hashes it via update_mmap()
    in each threading mode, and compares against hashing the same bytes
    directly. The temporary file is removed afterwards, pass or fail.
    """
    input_bytes = bytes([42]) * 1_000_000
    # Note that we can't use NamedTemporaryFile here, because we can't open it
    # again on Windows.
    (fd, temp_path) = tempfile.mkstemp()
    os.close(fd)
    try:
        with open(temp_path, "wb") as f:
            f.write(input_bytes)

        # Test all three threading modes, and both str and Path arguments. Note
        # that PyO3 doesn't support converting Python bytes to a Rust PathBuf,
        # I think because that's not generally possible on Windows.
        hasher1 = blake3()
        hasher1.update_mmap(temp_path)
        assert blake3(input_bytes).digest() == hasher1.digest()

        hasher2 = blake3(max_threads=blake3.AUTO)
        hasher2.update_mmap(Path(temp_path))
        assert blake3(input_bytes).digest() == hasher2.digest()

        hasher3 = blake3(max_threads=4)
        hasher3.update_mmap(temp_path)
        hasher3.update_mmap(path=Path(temp_path))
        assert blake3(2 * input_bytes).digest() == hasher3.digest()
    finally:
        # mkstemp() leaves deletion to the caller; without this, every test
        # run leaks a 1 MB file in the temp directory.
        os.remove(temp_path)

0 commit comments

Comments
 (0)