Skip to content

Commit

Permalink
add optional params and docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
abyesilyurt committed Oct 15, 2024
1 parent 9046af9 commit 484cfc7
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 8 deletions.
17 changes: 17 additions & 0 deletions py_fast_rsync/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,18 @@
"""
A faster implementation of librsync in pure Rust, wrapped for Python.
This module offers three major APIs:
1. `signature.calculate()`, which takes a block of data and returns a
"signature" of that data which is much smaller than the original data.
2. `diff()`, which takes a signature for some block A, and a block of data B, and
returns a delta between block A and block B. If A and B are "similar", then
the delta is usually much smaller than block B.
3. `apply()`, which takes a block A and a delta (as constructed by `diff()`), and
(usually) returns the block B.
This Python module wraps the Rust implementation, providing a high-performance
solution for efficient data synchronization and comparison.
"""

from .py_fast_rsync import *
43 changes: 41 additions & 2 deletions py_fast_rsync/py_fast_rsync.pyi
Original file line number Diff line number Diff line change
@@ -1,2 +1,41 @@
def diff(signature_bytes: bytes, data: bytes) -> bytes: ...
def apply(base: bytes, delta: bytes) -> bytes: ...
def diff(signature_bytes: bytes, data: bytes) -> bytes:
    """
    Calculate a delta and return it as bytes.

    This function computes a delta that can be applied to the base data represented by
    `signature_bytes` to attempt to reconstruct `data`.

    Args:
        signature_bytes (bytes): The signature of the base data.
        data (bytes): The target data to be reconstructed.

    Returns:
        bytes: The calculated delta.

    Security:
        Since this function uses the insecure MD4 hash algorithm, the resulting delta must not
        be trusted to correctly reconstruct `data`. The delta might fail to apply or produce
        the wrong data entirely. Always use another mechanism, like a cryptographic hash
        function, to validate the final reconstructed data.
    """
    ...

def apply(base: bytes, delta: bytes) -> bytes:
    """
    Apply `delta` to the base data `base` and return the result.

    This function applies the provided delta to the base data and returns the reconstructed
    data.

    Args:
        base (bytes): The original base data.
        delta (bytes): The delta to be applied.

    Returns:
        bytes: The reconstructed data after applying the delta.

    Security:
        This function should not be used with untrusted input, as a delta may create an
        arbitrarily large output which can exhaust available memory. Use `apply_limited()`
        instead to set an upper bound on the size of the output.
    """
    # NOTE(review): `apply_limited()` is referenced above but is not declared in this stub
    # file -- confirm that it is actually exported to Python, or drop the reference.
    ...
24 changes: 23 additions & 1 deletion py_fast_rsync/signature.pyi
Original file line number Diff line number Diff line change
@@ -1 +1,23 @@
def calculate(data: bytes) -> bytes: ...
def calculate(data: bytes, block_size: int = 4096, crypto_hash_size: int = 8) -> bytes:
    """
    Compute an MD4 signature for the given data.

    This function calculates an MD4 signature for the input data using the specified block
    size and crypto hash size.

    Args:
        data (bytes): The input data to compute the signature for.
        block_size (int, optional): The granularity of the signature. Smaller block sizes
            yield larger, but more precise, signatures. Must be greater than zero.
            Defaults to 4096.
        crypto_hash_size (int, optional): The number of bytes to use from the MD4 hash.
            Must be at most 16. The larger this is, the less likely that a delta will be
            mis-applied. Defaults to 8.

    Returns:
        bytes: The computed MD4 signature.

    Raises:
        ValueError: If block_size is not greater than zero or if crypto_hash_size is greater
            than 16.

    Note:
        If the native wrapper does not validate the options itself, invalid options trigger
        a Rust panic instead, which PyO3 surfaces as `pyo3_runtime.PanicException` rather
        than `ValueError`.
    """
    ...
14 changes: 9 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,20 @@ fn py_fast_rsync(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
// Signature module

#[pyfunction]
fn calculate(py: Python, data: &[u8]) -> PyResult<Py<PyBytes>> {
// Calculate the signature and store it in a variable
#[pyo3(signature = (data, block_size=4096, crypto_hash_size=8))]
fn calculate(
py: Python,
data: &[u8],
block_size: u32,
crypto_hash_size: u32,
) -> PyResult<Py<PyBytes>> {
let signature = Signature::calculate(
data,
SignatureOptions {
block_size: 4096, // Adjust this based on your data characteristics
crypto_hash_size: 8,
block_size,
crypto_hash_size,
},
);

// Convert Vec<u8> to PyBytes and return
Ok(PyBytes::new_bound(py, &signature.into_serialized()).into())
}
17 changes: 17 additions & 0 deletions tests/test_fast_rsync.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,20 @@ def test_diff():

probably_data_b = py_fast_rsync.apply(data_a, delta)
assert probably_data_b == data_b


def test_optional_params():
    """Round-trip a delta built from a signature computed with non-default options."""
    base = b"hello world"
    target = b"hello world!"

    # 1. Signature of the base data, overriding both optional parameters.
    sig_bytes = signature.calculate(base, block_size=2048, crypto_hash_size=16)

    # 2. Delta between the base (represented by its signature) and the target.
    patch = py_fast_rsync.diff(sig_bytes, target)

    # 3. Applying the delta to the base should reproduce the target.
    rebuilt = py_fast_rsync.apply(base, patch)
    assert rebuilt == target

0 comments on commit 484cfc7

Please sign in to comment.